xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 88a9f799449c04180c6b9a21d3b9c0c4b57e2bd6)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 use std::any::Any;
12 use std::collections::HashMap;
13 #[cfg(target_arch = "x86_64")]
14 use std::fs::File;
15 #[cfg(target_arch = "x86_64")]
16 use std::os::unix::io::AsRawFd;
17 #[cfg(feature = "tdx")]
18 use std::os::unix::io::RawFd;
19 use std::result;
20 #[cfg(target_arch = "x86_64")]
21 use std::sync::atomic::{AtomicBool, Ordering};
22 use std::sync::Mutex;
23 use std::sync::{Arc, RwLock};
24 
25 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
26 use vmm_sys_util::eventfd::EventFd;
27 
28 #[cfg(target_arch = "aarch64")]
29 use crate::aarch64::gic::KvmGicV3Its;
30 #[cfg(target_arch = "aarch64")]
31 pub use crate::aarch64::{
32     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
33     VcpuKvmState,
34 };
35 #[cfg(target_arch = "aarch64")]
36 use crate::arch::aarch64::gic::{Vgic, VgicConfig};
37 use crate::cpu;
38 use crate::hypervisor;
39 use crate::vec_with_array_field;
40 use crate::vm::{self, InterruptSourceConfig, VmOps};
41 use crate::HypervisorType;
42 #[cfg(target_arch = "aarch64")]
43 use crate::{arm64_core_reg_id, offset_of};
44 // x86_64 dependencies
45 #[cfg(target_arch = "x86_64")]
46 pub mod x86_64;
47 #[cfg(target_arch = "aarch64")]
48 use aarch64::{RegList, Register};
49 #[cfg(target_arch = "x86_64")]
50 use kvm_bindings::{
51     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
52     KVM_GUESTDBG_USE_HW_BP,
53 };
54 #[cfg(target_arch = "x86_64")]
55 use x86_64::check_required_kvm_extensions;
56 #[cfg(target_arch = "x86_64")]
57 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};
58 
59 #[cfg(target_arch = "x86_64")]
60 use crate::arch::x86::{
61     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
62 };
63 #[cfg(target_arch = "x86_64")]
64 use crate::ClockData;
65 use crate::StandardRegisters;
66 use crate::{
67     CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
68     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
69 };
70 // aarch64 dependencies
71 #[cfg(target_arch = "aarch64")]
72 pub mod aarch64;
73 #[cfg(target_arch = "aarch64")]
74 use std::mem;
75 
76 pub use kvm_bindings;
77 pub use kvm_bindings::{
78     kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
79     kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
80     KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
81     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
82 };
83 #[cfg(target_arch = "aarch64")]
84 use kvm_bindings::{
85     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
86     KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
87     KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
88     KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
89 };
90 #[cfg(feature = "tdx")]
91 use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
92 pub use kvm_ioctls;
93 pub use kvm_ioctls::{Cap, Kvm};
94 use thiserror::Error;
95 use vfio_ioctls::VfioDeviceFd;
96 #[cfg(feature = "tdx")]
97 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
98 ///
99 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
100 ///
101 pub use {
102     kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
103     kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
104 };
105 
106 #[cfg(target_arch = "x86_64")]
107 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
108 
109 #[cfg(target_arch = "x86_64")]
110 use vmm_sys_util::ioctl_io_nr;
111 #[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
112 use vmm_sys_util::ioctl_ioc_nr;
113 
114 #[cfg(target_arch = "x86_64")]
115 ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);
116 
117 #[cfg(feature = "tdx")]
118 const KVM_EXIT_TDX: u32 = 50;
119 #[cfg(feature = "tdx")]
120 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
121 #[cfg(feature = "tdx")]
122 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
123 #[cfg(feature = "tdx")]
124 const TDG_VP_VMCALL_SUCCESS: u64 = 0;
125 #[cfg(feature = "tdx")]
126 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;
127 
128 #[cfg(feature = "tdx")]
129 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
130 
131 #[cfg(feature = "tdx")]
132 #[repr(u32)]
133 enum TdxCommand {
134     Capabilities = 0,
135     InitVm,
136     InitVcpu,
137     InitMemRegion,
138     Finalize,
139 }
140 
141 #[cfg(feature = "tdx")]
142 pub enum TdxExitDetails {
143     GetQuote,
144     SetupEventNotifyInterrupt,
145 }
146 
147 #[cfg(feature = "tdx")]
148 pub enum TdxExitStatus {
149     Success,
150     InvalidOperand,
151 }
152 
153 #[cfg(feature = "tdx")]
154 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
155 
156 #[cfg(feature = "tdx")]
157 #[repr(C)]
158 #[derive(Debug, Default)]
159 pub struct TdxCpuidConfig {
160     pub leaf: u32,
161     pub sub_leaf: u32,
162     pub eax: u32,
163     pub ebx: u32,
164     pub ecx: u32,
165     pub edx: u32,
166 }
167 
168 #[cfg(feature = "tdx")]
169 #[repr(C)]
170 #[derive(Debug, Default)]
171 pub struct TdxCapabilities {
172     pub attrs_fixed0: u64,
173     pub attrs_fixed1: u64,
174     pub xfam_fixed0: u64,
175     pub xfam_fixed1: u64,
176     pub nr_cpuid_configs: u32,
177     pub padding: u32,
178     pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
179 }
180 
181 #[cfg(feature = "tdx")]
182 #[derive(Copy, Clone)]
183 pub struct KvmTdxExit {
184     pub type_: u32,
185     pub pad: u32,
186     pub u: KvmTdxExitU,
187 }
188 
189 #[cfg(feature = "tdx")]
190 #[repr(C)]
191 #[derive(Copy, Clone)]
192 pub union KvmTdxExitU {
193     pub vmcall: KvmTdxExitVmcall,
194 }
195 
196 #[cfg(feature = "tdx")]
197 #[repr(C)]
198 #[derive(Debug, Default, Copy, Clone, PartialEq)]
199 pub struct KvmTdxExitVmcall {
200     pub type_: u64,
201     pub subfunction: u64,
202     pub reg_mask: u64,
203     pub in_r12: u64,
204     pub in_r13: u64,
205     pub in_r14: u64,
206     pub in_r15: u64,
207     pub in_rbx: u64,
208     pub in_rdi: u64,
209     pub in_rsi: u64,
210     pub in_r8: u64,
211     pub in_r9: u64,
212     pub in_rdx: u64,
213     pub status_code: u64,
214     pub out_r11: u64,
215     pub out_r12: u64,
216     pub out_r13: u64,
217     pub out_r14: u64,
218     pub out_r15: u64,
219     pub out_rbx: u64,
220     pub out_rdi: u64,
221     pub out_rsi: u64,
222     pub out_r8: u64,
223     pub out_r9: u64,
224     pub out_rdx: u64,
225 }
226 
227 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
228     fn from(region: kvm_userspace_memory_region) -> Self {
229         let mut flags = USER_MEMORY_REGION_READ;
230         if region.flags & KVM_MEM_READONLY == 0 {
231             flags |= USER_MEMORY_REGION_WRITE;
232         }
233         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
234             flags |= USER_MEMORY_REGION_LOG_DIRTY;
235         }
236 
237         UserMemoryRegion {
238             slot: region.slot,
239             guest_phys_addr: region.guest_phys_addr,
240             memory_size: region.memory_size,
241             userspace_addr: region.userspace_addr,
242             flags,
243         }
244     }
245 }
246 
247 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
248     fn from(region: UserMemoryRegion) -> Self {
249         assert!(
250             region.flags & USER_MEMORY_REGION_READ != 0,
251             "KVM mapped memory is always readable"
252         );
253 
254         let mut flags = 0;
255         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
256             flags |= KVM_MEM_READONLY;
257         }
258         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
259             flags |= KVM_MEM_LOG_DIRTY_PAGES;
260         }
261 
262         kvm_userspace_memory_region {
263             slot: region.slot,
264             guest_phys_addr: region.guest_phys_addr,
265             memory_size: region.memory_size,
266             userspace_addr: region.userspace_addr,
267             flags,
268         }
269     }
270 }
271 
272 impl From<kvm_mp_state> for MpState {
273     fn from(s: kvm_mp_state) -> Self {
274         MpState::Kvm(s)
275     }
276 }
277 
278 impl From<MpState> for kvm_mp_state {
279     fn from(ms: MpState) -> Self {
280         match ms {
281             MpState::Kvm(s) => s,
282             /* Needed in case other hypervisors are enabled */
283             #[allow(unreachable_patterns)]
284             _ => panic!("CpuState is not valid"),
285         }
286     }
287 }
288 
289 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
290     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
291         match a {
292             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
293             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
294         }
295     }
296 }
297 
298 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
299     fn from(a: IoEventAddress) -> Self {
300         match a {
301             IoEventAddress::Pio(x) => Self::Pio(x),
302             IoEventAddress::Mmio(x) => Self::Mmio(x),
303         }
304     }
305 }
306 
307 impl From<VcpuKvmState> for CpuState {
308     fn from(s: VcpuKvmState) -> Self {
309         CpuState::Kvm(s)
310     }
311 }
312 
313 impl From<CpuState> for VcpuKvmState {
314     fn from(s: CpuState) -> Self {
315         match s {
316             CpuState::Kvm(s) => s,
317             /* Needed in case other hypervisors are enabled */
318             #[allow(unreachable_patterns)]
319             _ => panic!("CpuState is not valid"),
320         }
321     }
322 }
323 
324 #[cfg(target_arch = "x86_64")]
325 impl From<kvm_clock_data> for ClockData {
326     fn from(d: kvm_clock_data) -> Self {
327         ClockData::Kvm(d)
328     }
329 }
330 
331 #[cfg(target_arch = "x86_64")]
332 impl From<ClockData> for kvm_clock_data {
333     fn from(ms: ClockData) -> Self {
334         match ms {
335             ClockData::Kvm(s) => s,
336             /* Needed in case other hypervisors are enabled */
337             #[allow(unreachable_patterns)]
338             _ => panic!("CpuState is not valid"),
339         }
340     }
341 }
342 
343 impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
344     fn from(s: kvm_bindings::kvm_regs) -> Self {
345         crate::StandardRegisters::Kvm(s)
346     }
347 }
348 
349 impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
350     fn from(e: crate::StandardRegisters) -> Self {
351         match e {
352             crate::StandardRegisters::Kvm(e) => e,
353             /* Needed in case other hypervisors are enabled */
354             #[allow(unreachable_patterns)]
355             _ => panic!("StandardRegisters are not valid"),
356         }
357     }
358 }
359 
360 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
361     fn from(s: kvm_irq_routing_entry) -> Self {
362         IrqRoutingEntry::Kvm(s)
363     }
364 }
365 
366 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
367     fn from(e: IrqRoutingEntry) -> Self {
368         match e {
369             IrqRoutingEntry::Kvm(e) => e,
370             /* Needed in case other hypervisors are enabled */
371             #[allow(unreachable_patterns)]
372             _ => panic!("IrqRoutingEntry is not valid"),
373         }
374     }
375 }
376 
377 struct KvmDirtyLogSlot {
378     slot: u32,
379     guest_phys_addr: u64,
380     memory_size: u64,
381     userspace_addr: u64,
382 }
383 
384 /// Wrapper over KVM VM ioctls.
385 pub struct KvmVm {
386     fd: Arc<VmFd>,
387     #[cfg(target_arch = "x86_64")]
388     msrs: Vec<MsrEntry>,
389     dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
390 }
391 
392 impl KvmVm {
393     ///
394     /// Creates an emulated device in the kernel.
395     ///
396     /// See the documentation for `KVM_CREATE_DEVICE`.
397     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
398         let device_fd = self
399             .fd
400             .create_device(device)
401             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
402         Ok(VfioDeviceFd::new_from_kvm(device_fd))
403     }
404     /// Checks if a particular `Cap` is available.
405     pub fn check_extension(&self, c: Cap) -> bool {
406         self.fd.check_extension(c)
407     }
408 }
409 
410 /// Implementation of Vm trait for KVM
411 ///
412 /// # Examples
413 ///
414 /// ```
415 /// # use hypervisor::kvm::KvmHypervisor;
416 /// # use std::sync::Arc;
417 /// let kvm = KvmHypervisor::new().unwrap();
418 /// let hypervisor = Arc::new(kvm);
419 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
420 /// ```
421 impl vm::Vm for KvmVm {
422     #[cfg(target_arch = "x86_64")]
423     ///
424     /// Sets the address of the one-page region in the VM's address space.
425     ///
426     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
427         self.fd
428             .set_identity_map_address(address)
429             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
430     }
431 
432     #[cfg(target_arch = "x86_64")]
433     ///
434     /// Sets the address of the three-page region in the VM's address space.
435     ///
436     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
437         self.fd
438             .set_tss_address(offset)
439             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
440     }
441 
442     ///
443     /// Creates an in-kernel interrupt controller.
444     ///
445     fn create_irq_chip(&self) -> vm::Result<()> {
446         self.fd
447             .create_irq_chip()
448             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
449     }
450 
451     ///
452     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
453     ///
454     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
455         self.fd
456             .register_irqfd(fd, gsi)
457             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
458     }
459 
460     ///
461     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
462     ///
463     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
464         self.fd
465             .unregister_irqfd(fd, gsi)
466             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
467     }
468 
469     ///
470     /// Creates a VcpuFd object from a vcpu RawFd.
471     ///
472     fn create_vcpu(
473         &self,
474         id: u8,
475         vm_ops: Option<Arc<dyn VmOps>>,
476     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
477         let fd = self
478             .fd
479             .create_vcpu(id as u64)
480             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
481         let vcpu = KvmVcpu {
482             fd: Arc::new(Mutex::new(fd)),
483             #[cfg(target_arch = "x86_64")]
484             msrs: self.msrs.clone(),
485             vm_ops,
486             #[cfg(target_arch = "x86_64")]
487             hyperv_synic: AtomicBool::new(false),
488         };
489         Ok(Arc::new(vcpu))
490     }
491 
492     #[cfg(target_arch = "aarch64")]
493     ///
494     /// Creates a virtual GIC device.
495     ///
496     fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
497         let gic_device = KvmGicV3Its::new(self, config)
498             .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
499         Ok(Arc::new(Mutex::new(gic_device)))
500     }
501 
502     ///
503     /// Registers an event to be signaled whenever a certain address is written to.
504     ///
505     fn register_ioevent(
506         &self,
507         fd: &EventFd,
508         addr: &IoEventAddress,
509         datamatch: Option<vm::DataMatch>,
510     ) -> vm::Result<()> {
511         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
512         if let Some(dm) = datamatch {
513             match dm {
514                 vm::DataMatch::DataMatch32(kvm_dm32) => self
515                     .fd
516                     .register_ioevent(fd, addr, kvm_dm32)
517                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
518                 vm::DataMatch::DataMatch64(kvm_dm64) => self
519                     .fd
520                     .register_ioevent(fd, addr, kvm_dm64)
521                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
522             }
523         } else {
524             self.fd
525                 .register_ioevent(fd, addr, NoDatamatch)
526                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
527         }
528     }
529 
530     ///
531     /// Unregisters an event from a certain address it has been previously registered to.
532     ///
533     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
534         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
535         self.fd
536             .unregister_ioevent(fd, addr, NoDatamatch)
537             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
538     }
539 
540     ///
541     /// Constructs a routing entry
542     ///
543     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
544         match &config {
545             InterruptSourceConfig::MsiIrq(cfg) => {
546                 let mut kvm_route = kvm_irq_routing_entry {
547                     gsi,
548                     type_: KVM_IRQ_ROUTING_MSI,
549                     ..Default::default()
550                 };
551 
552                 kvm_route.u.msi.address_lo = cfg.low_addr;
553                 kvm_route.u.msi.address_hi = cfg.high_addr;
554                 kvm_route.u.msi.data = cfg.data;
555 
556                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
557                     // On AArch64, there is limitation on the range of the 'devid',
558                     // it cannot be greater than 65536 (the max of u16).
559                     //
560                     // BDF cannot be used directly, because 'segment' is in high
561                     // 16 bits. The layout of the u32 BDF is:
562                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
563                     // |      segment    |     bus    |   device   |  function  |
564                     //
565                     // Now that we support 1 bus only in a segment, we can build a
566                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
567                     // 'segment' data.
568                     // This way we can resolve the range checking problem and give
569                     // different `devid` to all the devices. Limitation is that at
570                     // most 256 segments can be supported.
571                     //
572                     let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
573 
574                     kvm_route.flags = KVM_MSI_VALID_DEVID;
575                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
576                 }
577                 kvm_route.into()
578             }
579             InterruptSourceConfig::LegacyIrq(cfg) => {
580                 let mut kvm_route = kvm_irq_routing_entry {
581                     gsi,
582                     type_: KVM_IRQ_ROUTING_IRQCHIP,
583                     ..Default::default()
584                 };
585                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
586                 kvm_route.u.irqchip.pin = cfg.pin;
587 
588                 kvm_route.into()
589             }
590         }
591     }
592 
593     ///
594     /// Sets the GSI routing table entries, overwriting any previously set
595     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
596     ///
597     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
598         let mut irq_routing =
599             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
600         irq_routing[0].nr = entries.len() as u32;
601         irq_routing[0].flags = 0;
602         let entries: Vec<kvm_irq_routing_entry> = entries
603             .iter()
604             .map(|entry| match entry {
605                 IrqRoutingEntry::Kvm(e) => *e,
606                 #[allow(unreachable_patterns)]
607                 _ => panic!("IrqRoutingEntry type is wrong"),
608             })
609             .collect();
610 
611         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
612         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
613         // everything from entries.
614         unsafe {
615             let entries_slice: &mut [kvm_irq_routing_entry] =
616                 irq_routing[0].entries.as_mut_slice(entries.len());
617             entries_slice.copy_from_slice(&entries);
618         }
619 
620         self.fd
621             .set_gsi_routing(&irq_routing[0])
622             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
623     }
624 
625     ///
626     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
627     ///
628     fn make_user_memory_region(
629         &self,
630         slot: u32,
631         guest_phys_addr: u64,
632         memory_size: u64,
633         userspace_addr: u64,
634         readonly: bool,
635         log_dirty_pages: bool,
636     ) -> UserMemoryRegion {
637         kvm_userspace_memory_region {
638             slot,
639             guest_phys_addr,
640             memory_size,
641             userspace_addr,
642             flags: if readonly { KVM_MEM_READONLY } else { 0 }
643                 | if log_dirty_pages {
644                     KVM_MEM_LOG_DIRTY_PAGES
645                 } else {
646                     0
647                 },
648         }
649         .into()
650     }
651 
652     ///
653     /// Creates a guest physical memory region.
654     ///
655     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
656         let mut region: kvm_userspace_memory_region = user_memory_region.into();
657 
658         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
659             if (region.flags & KVM_MEM_READONLY) != 0 {
660                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
661                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
662                 )));
663             }
664 
665             // Keep track of the regions that need dirty pages log
666             self.dirty_log_slots.write().unwrap().insert(
667                 region.slot,
668                 KvmDirtyLogSlot {
669                     slot: region.slot,
670                     guest_phys_addr: region.guest_phys_addr,
671                     memory_size: region.memory_size,
672                     userspace_addr: region.userspace_addr,
673                 },
674             );
675 
676             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
677             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
678             region.flags = 0;
679         }
680 
681         // SAFETY: Safe because guest regions are guaranteed not to overlap.
682         unsafe {
683             self.fd
684                 .set_user_memory_region(region)
685                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
686         }
687     }
688 
689     ///
690     /// Removes a guest physical memory region.
691     ///
692     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
693         let mut region: kvm_userspace_memory_region = user_memory_region.into();
694 
695         // Remove the corresponding entry from "self.dirty_log_slots" if needed
696         self.dirty_log_slots.write().unwrap().remove(&region.slot);
697 
698         // Setting the size to 0 means "remove"
699         region.memory_size = 0;
700         // SAFETY: Safe because guest regions are guaranteed not to overlap.
701         unsafe {
702             self.fd
703                 .set_user_memory_region(region)
704                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
705         }
706     }
707 
708     ///
709     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
710     ///
711     #[cfg(target_arch = "aarch64")]
712     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
713         self.fd
714             .get_preferred_target(kvi)
715             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
716     }
717 
718     #[cfg(target_arch = "x86_64")]
719     fn enable_split_irq(&self) -> vm::Result<()> {
720         // Create split irqchip
721         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
722         // are not.
723         let mut cap = kvm_enable_cap {
724             cap: KVM_CAP_SPLIT_IRQCHIP,
725             ..Default::default()
726         };
727         cap.args[0] = NUM_IOAPIC_PINS as u64;
728         self.fd
729             .enable_cap(&cap)
730             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
731         Ok(())
732     }
733 
734     #[cfg(target_arch = "x86_64")]
735     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
736         let mut cap = kvm_enable_cap {
737             cap: KVM_CAP_SGX_ATTRIBUTE,
738             ..Default::default()
739         };
740         cap.args[0] = file.as_raw_fd() as u64;
741         self.fd
742             .enable_cap(&cap)
743             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
744         Ok(())
745     }
746 
747     /// Retrieve guest clock.
748     #[cfg(target_arch = "x86_64")]
749     fn get_clock(&self) -> vm::Result<ClockData> {
750         Ok(self
751             .fd
752             .get_clock()
753             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
754             .into())
755     }
756 
757     /// Set guest clock.
758     #[cfg(target_arch = "x86_64")]
759     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
760         let data = (*data).into();
761         self.fd
762             .set_clock(&data)
763             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
764     }
765 
766     /// Create a device that is used for passthrough
767     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
768         let mut vfio_dev = kvm_create_device {
769             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
770             fd: 0,
771             flags: 0,
772         };
773 
774         self.create_device(&mut vfio_dev)
775             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
776     }
777 
778     ///
779     /// Start logging dirty pages
780     ///
781     fn start_dirty_log(&self) -> vm::Result<()> {
782         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
783         for (_, s) in dirty_log_slots.iter() {
784             let region = kvm_userspace_memory_region {
785                 slot: s.slot,
786                 guest_phys_addr: s.guest_phys_addr,
787                 memory_size: s.memory_size,
788                 userspace_addr: s.userspace_addr,
789                 flags: KVM_MEM_LOG_DIRTY_PAGES,
790             };
791             // SAFETY: Safe because guest regions are guaranteed not to overlap.
792             unsafe {
793                 self.fd
794                     .set_user_memory_region(region)
795                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
796             }
797         }
798 
799         Ok(())
800     }
801 
802     ///
803     /// Stop logging dirty pages
804     ///
805     fn stop_dirty_log(&self) -> vm::Result<()> {
806         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
807         for (_, s) in dirty_log_slots.iter() {
808             let region = kvm_userspace_memory_region {
809                 slot: s.slot,
810                 guest_phys_addr: s.guest_phys_addr,
811                 memory_size: s.memory_size,
812                 userspace_addr: s.userspace_addr,
813                 flags: 0,
814             };
815             // SAFETY: Safe because guest regions are guaranteed not to overlap.
816             unsafe {
817                 self.fd
818                     .set_user_memory_region(region)
819                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
820             }
821         }
822 
823         Ok(())
824     }
825 
826     ///
827     /// Get dirty pages bitmap (one bit per page)
828     ///
829     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
830         self.fd
831             .get_dirty_log(slot, memory_size as usize)
832             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
833     }
834 
835     ///
836     /// Initialize TDX for this VM
837     ///
838     #[cfg(feature = "tdx")]
839     fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
840         const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;
841 
842         let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
843             cpuid.iter().map(|e| (*e).into()).collect();
844         cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());
845 
846         #[repr(C)]
847         struct TdxInitVm {
848             attributes: u64,
849             max_vcpus: u32,
850             padding: u32,
851             mrconfigid: [u64; 6],
852             mrowner: [u64; 6],
853             mrownerconfig: [u64; 6],
854             cpuid_nent: u32,
855             cpuid_padding: u32,
856             cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
857         }
858         let data = TdxInitVm {
859             attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
860             max_vcpus,
861             padding: 0,
862             mrconfigid: [0; 6],
863             mrowner: [0; 6],
864             mrownerconfig: [0; 6],
865             cpuid_nent: cpuid.len() as u32,
866             cpuid_padding: 0,
867             cpuid_entries: cpuid.as_slice().try_into().unwrap(),
868         };
869 
870         tdx_command(
871             &self.fd.as_raw_fd(),
872             TdxCommand::InitVm,
873             0,
874             &data as *const _ as u64,
875         )
876         .map_err(vm::HypervisorVmError::InitializeTdx)
877     }
878 
879     ///
880     /// Finalize the TDX setup for this VM
881     ///
882     #[cfg(feature = "tdx")]
883     fn tdx_finalize(&self) -> vm::Result<()> {
884         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
885             .map_err(vm::HypervisorVmError::FinalizeTdx)
886     }
887 
888     ///
889     /// Initialize memory regions for the TDX VM
890     ///
891     #[cfg(feature = "tdx")]
892     fn tdx_init_memory_region(
893         &self,
894         host_address: u64,
895         guest_address: u64,
896         size: u64,
897         measure: bool,
898     ) -> vm::Result<()> {
899         #[repr(C)]
900         struct TdxInitMemRegion {
901             host_address: u64,
902             guest_address: u64,
903             pages: u64,
904         }
905         let data = TdxInitMemRegion {
906             host_address,
907             guest_address,
908             pages: size / 4096,
909         };
910 
911         tdx_command(
912             &self.fd.as_raw_fd(),
913             TdxCommand::InitMemRegion,
914             u32::from(measure),
915             &data as *const _ as u64,
916         )
917         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
918     }
919 
920     /// Downcast to the underlying KvmVm type
921     fn as_any(&self) -> &dyn Any {
922         self
923     }
924 }
925 
926 #[cfg(feature = "tdx")]
927 fn tdx_command(
928     fd: &RawFd,
929     command: TdxCommand,
930     flags: u32,
931     data: u64,
932 ) -> std::result::Result<(), std::io::Error> {
933     #[repr(C)]
934     struct TdxIoctlCmd {
935         command: TdxCommand,
936         flags: u32,
937         data: u64,
938         error: u64,
939         unused: u64,
940     }
941     let cmd = TdxIoctlCmd {
942         command,
943         flags,
944         data,
945         error: 0,
946         unused: 0,
947     };
948     // SAFETY: FFI call. All input parameters are valid.
949     let ret = unsafe {
950         ioctl_with_val(
951             fd,
952             KVM_MEMORY_ENCRYPT_OP(),
953             &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
954         )
955     };
956 
957     if ret < 0 {
958         return Err(std::io::Error::last_os_error());
959     }
960     Ok(())
961 }
962 
963 /// Wrapper over KVM system ioctls.
964 pub struct KvmHypervisor {
965     kvm: Kvm,
966 }
967 
968 impl KvmHypervisor {
969     #[cfg(target_arch = "x86_64")]
970     ///
971     /// Retrieve the list of MSRs supported by the hypervisor.
972     ///
973     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
974         self.kvm
975             .get_msr_index_list()
976             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
977     }
978 }
979 
980 /// Enum for KVM related error
981 #[derive(Debug, Error)]
982 pub enum KvmError {
983     #[error("Capability missing: {0:?}")]
984     CapabilityMissing(Cap),
985 }
986 
987 pub type KvmResult<T> = result::Result<T, KvmError>;
988 
989 impl KvmHypervisor {
990     /// Create a hypervisor based on Kvm
991     #[allow(clippy::new_ret_no_self)]
992     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
993         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
994         let api_version = kvm_obj.get_api_version();
995 
996         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
997             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
998         }
999 
1000         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
1001     }
1002 
1003     /// Check if the hypervisor is available
1004     pub fn is_available() -> hypervisor::Result<bool> {
1005         match std::fs::metadata("/dev/kvm") {
1006             Ok(_) => Ok(true),
1007             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1008             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
1009                 err.into(),
1010             )),
1011         }
1012     }
1013 }
1014 
1015 /// Implementation of Hypervisor trait for KVM
1016 ///
1017 /// # Examples
1018 ///
1019 /// ```
1020 /// # use hypervisor::kvm::KvmHypervisor;
1021 /// # use std::sync::Arc;
1022 /// let kvm = KvmHypervisor::new().unwrap();
1023 /// let hypervisor = Arc::new(kvm);
1024 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1025 /// ```
1026 impl hypervisor::Hypervisor for KvmHypervisor {
1027     ///
1028     /// Returns the type of the hypervisor
1029     ///
1030     fn hypervisor_type(&self) -> HypervisorType {
1031         HypervisorType::Kvm
1032     }
1033 
1034     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
1035     ///
1036     /// # Examples
1037     ///
1038     /// ```
1039     /// # use hypervisor::kvm::KvmHypervisor;
1040     /// use hypervisor::kvm::KvmVm;
1041     /// let hypervisor = KvmHypervisor::new().unwrap();
1042     /// let vm = hypervisor.create_vm_with_type(0).unwrap();
1043     /// ```
1044     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1045         let fd: VmFd;
1046         loop {
1047             match self.kvm.create_vm_with_type(vm_type) {
1048                 Ok(res) => fd = res,
1049                 Err(e) => {
1050                     if e.errno() == libc::EINTR {
1051                         // If the error returned is EINTR, which means the
1052                         // ioctl has been interrupted, we have to retry as
1053                         // this can't be considered as a regular error.
1054                         continue;
1055                     } else {
1056                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
1057                     }
1058                 }
1059             }
1060             break;
1061         }
1062 
1063         let vm_fd = Arc::new(fd);
1064 
1065         #[cfg(target_arch = "x86_64")]
1066         {
1067             let msr_list = self.get_msr_list()?;
1068             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
1069             let mut msrs: Vec<MsrEntry> = vec![
1070                 MsrEntry {
1071                     ..Default::default()
1072                 };
1073                 num_msrs
1074             ];
1075             let indices = msr_list.as_slice();
1076             for (pos, index) in indices.iter().enumerate() {
1077                 msrs[pos].index = *index;
1078             }
1079 
1080             Ok(Arc::new(KvmVm {
1081                 fd: vm_fd,
1082                 msrs,
1083                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1084             }))
1085         }
1086 
1087         #[cfg(target_arch = "aarch64")]
1088         {
1089             Ok(Arc::new(KvmVm {
1090                 fd: vm_fd,
1091                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1092             }))
1093         }
1094     }
1095 
1096     /// Create a KVM vm object and return the object as Vm trait object
1097     ///
1098     /// # Examples
1099     ///
1100     /// ```
1101     /// # use hypervisor::kvm::KvmHypervisor;
1102     /// use hypervisor::kvm::KvmVm;
1103     /// let hypervisor = KvmHypervisor::new().unwrap();
1104     /// let vm = hypervisor.create_vm().unwrap();
1105     /// ```
1106     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1107         #[allow(unused_mut)]
1108         let mut vm_type: u64 = 0; // Create with default platform type
1109 
1110         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
1111         // size from the host and use that when creating the VM, which may
1112         // avoid unnecessary VM creation failures.
1113         #[cfg(target_arch = "aarch64")]
1114         if self.kvm.check_extension(Cap::ArmVmIPASize) {
1115             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
1116         }
1117 
1118         self.create_vm_with_type(vm_type)
1119     }
1120 
1121     fn check_required_extensions(&self) -> hypervisor::Result<()> {
1122         check_required_kvm_extensions(&self.kvm)
1123             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
1124     }
1125 
1126     #[cfg(target_arch = "x86_64")]
1127     ///
1128     /// X86 specific call to get the system supported CPUID values.
1129     ///
1130     fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
1131         let kvm_cpuid = self
1132             .kvm
1133             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
1134             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;
1135 
1136         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1137 
1138         Ok(v)
1139     }
1140 
1141     #[cfg(target_arch = "aarch64")]
1142     ///
1143     /// Retrieve AArch64 host maximum IPA size supported by KVM.
1144     ///
1145     fn get_host_ipa_limit(&self) -> i32 {
1146         self.kvm.get_host_ipa_limit()
1147     }
1148 
1149     ///
1150     /// Retrieve TDX capabilities
1151     ///
1152     #[cfg(feature = "tdx")]
1153     fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
1154         let data = TdxCapabilities {
1155             nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
1156             ..Default::default()
1157         };
1158 
1159         tdx_command(
1160             &self.kvm.as_raw_fd(),
1161             TdxCommand::Capabilities,
1162             0,
1163             &data as *const _ as u64,
1164         )
1165         .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;
1166 
1167         Ok(data)
1168     }
1169 
1170     ///
1171     /// Get the number of supported hardware breakpoints
1172     ///
1173     fn get_guest_debug_hw_bps(&self) -> usize {
1174         #[cfg(target_arch = "x86_64")]
1175         {
1176             4
1177         }
1178         #[cfg(target_arch = "aarch64")]
1179         {
1180             self.kvm.get_guest_debug_hw_bps() as usize
1181         }
1182     }
1183 
1184     /// Get maximum number of vCPUs
1185     fn get_max_vcpus(&self) -> u32 {
1186         self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
1187     }
1188 }
1189 
1190 /// Vcpu struct for KVM
1191 pub struct KvmVcpu {
1192     fd: Arc<Mutex<VcpuFd>>,
1193     #[cfg(target_arch = "x86_64")]
1194     msrs: Vec<MsrEntry>,
1195     vm_ops: Option<Arc<dyn vm::VmOps>>,
1196     #[cfg(target_arch = "x86_64")]
1197     hyperv_synic: AtomicBool,
1198 }
1199 
1200 /// Implementation of Vcpu trait for KVM
1201 ///
1202 /// # Examples
1203 ///
1204 /// ```
1205 /// # use hypervisor::kvm::KvmHypervisor;
1206 /// # use std::sync::Arc;
1207 /// let kvm = KvmHypervisor::new().unwrap();
1208 /// let hypervisor = Arc::new(kvm);
1209 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1210 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1211 /// ```
1212 impl cpu::Vcpu for KvmVcpu {
1213     ///
1214     /// Returns StandardRegisters with default value set
1215     ///
1216     #[cfg(target_arch = "x86_64")]
1217     fn create_standard_regs(&self) -> StandardRegisters {
1218         kvm_bindings::kvm_regs::default().into()
1219     }
1220     #[cfg(target_arch = "x86_64")]
1221     ///
1222     /// Returns the vCPU general purpose registers.
1223     ///
1224     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1225         Ok(self
1226             .fd
1227             .lock()
1228             .unwrap()
1229             .get_regs()
1230             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1231             .into())
1232     }
1233 
1234     ///
1235     /// Returns the vCPU general purpose registers.
1236     /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
1237     /// is used to get registers one by one.
1238     ///
1239     #[cfg(target_arch = "aarch64")]
1240     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1241         let mut state = kvm_regs::default();
1242         let mut off = offset_of!(user_pt_regs, regs);
1243         // There are 31 user_pt_regs:
1244         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1245         // These actually are the general-purpose registers of the Armv8-a
1246         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1247         for i in 0..31 {
1248             let mut bytes = [0_u8; 8];
1249             self.fd
1250                 .lock()
1251                 .unwrap()
1252                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1253                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1254             state.regs.regs[i] = u64::from_le_bytes(bytes);
1255             off += std::mem::size_of::<u64>();
1256         }
1257 
1258         // We are now entering the "Other register" section of the ARMv8-a architecture.
1259         // First one, stack pointer.
1260         let off = offset_of!(user_pt_regs, sp);
1261         let mut bytes = [0_u8; 8];
1262         self.fd
1263             .lock()
1264             .unwrap()
1265             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1266             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1267         state.regs.sp = u64::from_le_bytes(bytes);
1268 
1269         // Second one, the program counter.
1270         let off = offset_of!(user_pt_regs, pc);
1271         let mut bytes = [0_u8; 8];
1272         self.fd
1273             .lock()
1274             .unwrap()
1275             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1276             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1277         state.regs.pc = u64::from_le_bytes(bytes);
1278 
1279         // Next is the processor state.
1280         let off = offset_of!(user_pt_regs, pstate);
1281         let mut bytes = [0_u8; 8];
1282         self.fd
1283             .lock()
1284             .unwrap()
1285             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1286             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1287         state.regs.pstate = u64::from_le_bytes(bytes);
1288 
1289         // The stack pointer associated with EL1
1290         let off = offset_of!(kvm_regs, sp_el1);
1291         let mut bytes = [0_u8; 8];
1292         self.fd
1293             .lock()
1294             .unwrap()
1295             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1296             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1297         state.sp_el1 = u64::from_le_bytes(bytes);
1298 
1299         // Exception Link Register for EL1, when taking an exception to EL1, this register
1300         // holds the address to which to return afterwards.
1301         let off = offset_of!(kvm_regs, elr_el1);
1302         let mut bytes = [0_u8; 8];
1303         self.fd
1304             .lock()
1305             .unwrap()
1306             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1307             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1308         state.elr_el1 = u64::from_le_bytes(bytes);
1309 
1310         // Saved Program Status Registers, there are 5 of them used in the kernel.
1311         let mut off = offset_of!(kvm_regs, spsr);
1312         for i in 0..KVM_NR_SPSR as usize {
1313             let mut bytes = [0_u8; 8];
1314             self.fd
1315                 .lock()
1316                 .unwrap()
1317                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1318                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1319             state.spsr[i] = u64::from_le_bytes(bytes);
1320             off += std::mem::size_of::<u64>();
1321         }
1322 
1323         // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
1324         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1325         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1326         for i in 0..32 {
1327             let mut bytes = [0_u8; 16];
1328             self.fd
1329                 .lock()
1330                 .unwrap()
1331                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
1332                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1333             state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
1334             off += mem::size_of::<u128>();
1335         }
1336 
1337         // Floating-point Status Register
1338         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1339         let mut bytes = [0_u8; 4];
1340         self.fd
1341             .lock()
1342             .unwrap()
1343             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1344             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1345         state.fp_regs.fpsr = u32::from_le_bytes(bytes);
1346 
1347         // Floating-point Control Register
1348         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1349         let mut bytes = [0_u8; 4];
1350         self.fd
1351             .lock()
1352             .unwrap()
1353             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1354             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1355         state.fp_regs.fpcr = u32::from_le_bytes(bytes);
1356         Ok(state.into())
1357     }
1358 
1359     #[cfg(target_arch = "x86_64")]
1360     ///
1361     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1362     ///
1363     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1364         let regs = (*regs).into();
1365         self.fd
1366             .lock()
1367             .unwrap()
1368             .set_regs(&regs)
1369             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1370     }
1371 
1372     ///
1373     /// Sets the vCPU general purpose registers.
1374     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1375     /// is used to set registers one by one.
1376     ///
1377     #[cfg(target_arch = "aarch64")]
1378     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1379         // The function follows the exact identical order from `state`. Look there
1380         // for some additional info on registers.
1381         let kvm_regs_state: kvm_regs = (*state).into();
1382         let mut off = offset_of!(user_pt_regs, regs);
1383         for i in 0..31 {
1384             self.fd
1385                 .lock()
1386                 .unwrap()
1387                 .set_one_reg(
1388                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1389                     &kvm_regs_state.regs.regs[i].to_le_bytes(),
1390                 )
1391                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1392             off += std::mem::size_of::<u64>();
1393         }
1394 
1395         let off = offset_of!(user_pt_regs, sp);
1396         self.fd
1397             .lock()
1398             .unwrap()
1399             .set_one_reg(
1400                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1401                 &kvm_regs_state.regs.sp.to_le_bytes(),
1402             )
1403             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1404 
1405         let off = offset_of!(user_pt_regs, pc);
1406         self.fd
1407             .lock()
1408             .unwrap()
1409             .set_one_reg(
1410                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1411                 &kvm_regs_state.regs.pc.to_le_bytes(),
1412             )
1413             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1414 
1415         let off = offset_of!(user_pt_regs, pstate);
1416         self.fd
1417             .lock()
1418             .unwrap()
1419             .set_one_reg(
1420                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1421                 &kvm_regs_state.regs.pstate.to_le_bytes(),
1422             )
1423             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1424 
1425         let off = offset_of!(kvm_regs, sp_el1);
1426         self.fd
1427             .lock()
1428             .unwrap()
1429             .set_one_reg(
1430                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1431                 &kvm_regs_state.sp_el1.to_le_bytes(),
1432             )
1433             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1434 
1435         let off = offset_of!(kvm_regs, elr_el1);
1436         self.fd
1437             .lock()
1438             .unwrap()
1439             .set_one_reg(
1440                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1441                 &kvm_regs_state.elr_el1.to_le_bytes(),
1442             )
1443             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1444 
1445         let mut off = offset_of!(kvm_regs, spsr);
1446         for i in 0..KVM_NR_SPSR as usize {
1447             self.fd
1448                 .lock()
1449                 .unwrap()
1450                 .set_one_reg(
1451                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1452                     &kvm_regs_state.spsr[i].to_le_bytes(),
1453                 )
1454                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1455             off += std::mem::size_of::<u64>();
1456         }
1457 
1458         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1459         for i in 0..32 {
1460             self.fd
1461                 .lock()
1462                 .unwrap()
1463                 .set_one_reg(
1464                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1465                     &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
1466                 )
1467                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1468             off += mem::size_of::<u128>();
1469         }
1470 
1471         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1472         self.fd
1473             .lock()
1474             .unwrap()
1475             .set_one_reg(
1476                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1477                 &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
1478             )
1479             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1480 
1481         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1482         self.fd
1483             .lock()
1484             .unwrap()
1485             .set_one_reg(
1486                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1487                 &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
1488             )
1489             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1490         Ok(())
1491     }
1492 
1493     #[cfg(target_arch = "x86_64")]
1494     ///
1495     /// Returns the vCPU special registers.
1496     ///
1497     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1498         Ok(self
1499             .fd
1500             .lock()
1501             .unwrap()
1502             .get_sregs()
1503             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1504             .into())
1505     }
1506 
1507     #[cfg(target_arch = "x86_64")]
1508     ///
1509     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1510     ///
1511     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1512         let sregs = (*sregs).into();
1513         self.fd
1514             .lock()
1515             .unwrap()
1516             .set_sregs(&sregs)
1517             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1518     }
1519 
1520     #[cfg(target_arch = "x86_64")]
1521     ///
1522     /// Returns the floating point state (FPU) from the vCPU.
1523     ///
1524     fn get_fpu(&self) -> cpu::Result<FpuState> {
1525         Ok(self
1526             .fd
1527             .lock()
1528             .unwrap()
1529             .get_fpu()
1530             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1531             .into())
1532     }
1533 
1534     #[cfg(target_arch = "x86_64")]
1535     ///
1536     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1537     ///
1538     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1539         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1540         self.fd
1541             .lock()
1542             .unwrap()
1543             .set_fpu(&fpu)
1544             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1545     }
1546 
1547     #[cfg(target_arch = "x86_64")]
1548     ///
1549     /// X86 specific call to setup the CPUID registers.
1550     ///
1551     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1552         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1553             cpuid.iter().map(|e| (*e).into()).collect();
1554         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1555             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1556 
1557         self.fd
1558             .lock()
1559             .unwrap()
1560             .set_cpuid2(&kvm_cpuid)
1561             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1562     }
1563 
1564     #[cfg(target_arch = "x86_64")]
1565     ///
1566     /// X86 specific call to enable HyperV SynIC
1567     ///
1568     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1569         // Update the information about Hyper-V SynIC being enabled and
1570         // emulated as it will influence later which MSRs should be saved.
1571         self.hyperv_synic.store(true, Ordering::Release);
1572 
1573         let cap = kvm_enable_cap {
1574             cap: KVM_CAP_HYPERV_SYNIC,
1575             ..Default::default()
1576         };
1577         self.fd
1578             .lock()
1579             .unwrap()
1580             .enable_cap(&cap)
1581             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1582     }
1583 
1584     ///
1585     /// X86 specific call to retrieve the CPUID registers.
1586     ///
1587     #[cfg(target_arch = "x86_64")]
1588     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1589         let kvm_cpuid = self
1590             .fd
1591             .lock()
1592             .unwrap()
1593             .get_cpuid2(num_entries)
1594             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1595 
1596         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1597 
1598         Ok(v)
1599     }
1600 
1601     #[cfg(target_arch = "x86_64")]
1602     ///
1603     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1604     ///
1605     fn get_lapic(&self) -> cpu::Result<LapicState> {
1606         Ok(self
1607             .fd
1608             .lock()
1609             .unwrap()
1610             .get_lapic()
1611             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1612             .into())
1613     }
1614 
1615     #[cfg(target_arch = "x86_64")]
1616     ///
1617     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1618     ///
1619     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1620         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1621         self.fd
1622             .lock()
1623             .unwrap()
1624             .set_lapic(&klapic)
1625             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1626     }
1627 
1628     #[cfg(target_arch = "x86_64")]
1629     ///
1630     /// Returns the model-specific registers (MSR) for this vCPU.
1631     ///
1632     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1633         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1634         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1635         let succ = self
1636             .fd
1637             .lock()
1638             .unwrap()
1639             .get_msrs(&mut kvm_msrs)
1640             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1641 
1642         msrs[..succ].copy_from_slice(
1643             &kvm_msrs.as_slice()[..succ]
1644                 .iter()
1645                 .map(|e| (*e).into())
1646                 .collect::<Vec<MsrEntry>>(),
1647         );
1648 
1649         Ok(succ)
1650     }
1651 
1652     #[cfg(target_arch = "x86_64")]
1653     ///
1654     /// Setup the model-specific registers (MSR) for this vCPU.
1655     /// Returns the number of MSR entries actually written.
1656     ///
1657     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1658         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1659         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1660         self.fd
1661             .lock()
1662             .unwrap()
1663             .set_msrs(&kvm_msrs)
1664             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1665     }
1666 
1667     ///
1668     /// Returns the vcpu's current "multiprocessing state".
1669     ///
1670     fn get_mp_state(&self) -> cpu::Result<MpState> {
1671         Ok(self
1672             .fd
1673             .lock()
1674             .unwrap()
1675             .get_mp_state()
1676             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1677             .into())
1678     }
1679 
1680     ///
1681     /// Sets the vcpu's current "multiprocessing state".
1682     ///
1683     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1684         self.fd
1685             .lock()
1686             .unwrap()
1687             .set_mp_state(mp_state.into())
1688             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1689     }
1690 
1691     #[cfg(target_arch = "x86_64")]
1692     ///
1693     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1694     ///
1695     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1696         let tr = self
1697             .fd
1698             .lock()
1699             .unwrap()
1700             .translate_gva(gva)
1701             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1702         // tr.valid is set if the GVA is mapped to valid GPA.
1703         match tr.valid {
1704             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1705                 "Invalid GVA: {:#x}",
1706                 gva
1707             ))),
1708             _ => Ok((tr.physical_address, 0)),
1709         }
1710     }
1711 
1712     ///
1713     /// Triggers the running of the current virtual CPU returning an exit reason.
1714     ///
1715     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1716         match self.fd.lock().unwrap().run() {
1717             Ok(run) => match run {
1718                 #[cfg(target_arch = "x86_64")]
1719                 VcpuExit::IoIn(addr, data) => {
1720                     if let Some(vm_ops) = &self.vm_ops {
1721                         return vm_ops
1722                             .pio_read(addr.into(), data)
1723                             .map(|_| cpu::VmExit::Ignore)
1724                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1725                     }
1726 
1727                     Ok(cpu::VmExit::Ignore)
1728                 }
1729                 #[cfg(target_arch = "x86_64")]
1730                 VcpuExit::IoOut(addr, data) => {
1731                     if let Some(vm_ops) = &self.vm_ops {
1732                         return vm_ops
1733                             .pio_write(addr.into(), data)
1734                             .map(|_| cpu::VmExit::Ignore)
1735                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1736                     }
1737 
1738                     Ok(cpu::VmExit::Ignore)
1739                 }
1740                 #[cfg(target_arch = "x86_64")]
1741                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1742                 #[cfg(target_arch = "x86_64")]
1743                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1744 
1745                 #[cfg(target_arch = "aarch64")]
1746                 VcpuExit::SystemEvent(event_type, flags) => {
1747                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1748                     // On Aarch64, when the VM is shutdown, run() returns
1749                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
1750                     if event_type == KVM_SYSTEM_EVENT_RESET {
1751                         Ok(cpu::VmExit::Reset)
1752                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1753                         Ok(cpu::VmExit::Shutdown)
1754                     } else {
1755                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1756                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
1757                             event_type,
1758                             flags
1759                         )))
1760                     }
1761                 }
1762 
1763                 VcpuExit::MmioRead(addr, data) => {
1764                     if let Some(vm_ops) = &self.vm_ops {
1765                         return vm_ops
1766                             .mmio_read(addr, data)
1767                             .map(|_| cpu::VmExit::Ignore)
1768                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1769                     }
1770 
1771                     Ok(cpu::VmExit::Ignore)
1772                 }
1773                 VcpuExit::MmioWrite(addr, data) => {
1774                     if let Some(vm_ops) = &self.vm_ops {
1775                         return vm_ops
1776                             .mmio_write(addr, data)
1777                             .map(|_| cpu::VmExit::Ignore)
1778                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1779                     }
1780 
1781                     Ok(cpu::VmExit::Ignore)
1782                 }
1783                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1784                 #[cfg(feature = "tdx")]
1785                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1786                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1787 
1788                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1789                     "Unexpected exit reason on vcpu run: {:?}",
1790                     r
1791                 ))),
1792             },
1793 
1794             Err(ref e) => match e.errno() {
1795                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1796                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1797                     "VCPU error {:?}",
1798                     e
1799                 ))),
1800             },
1801         }
1802     }
1803 
1804     #[cfg(target_arch = "x86_64")]
1805     ///
1806     /// Let the guest know that it has been paused, which prevents from
1807     /// potential soft lockups when being resumed.
1808     ///
1809     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1810         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
1811             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1812             // which could be because we're still in firmware or the guest doesn't
1813             // use KVM clock.
1814             if e.errno() != libc::EINVAL {
1815                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1816             }
1817         }
1818 
1819         Ok(())
1820     }
1821 
1822     ///
1823     /// Sets debug registers to set hardware breakpoints and/or enable single step.
1824     ///
1825     fn set_guest_debug(
1826         &self,
1827         addrs: &[vm_memory::GuestAddress],
1828         singlestep: bool,
1829     ) -> cpu::Result<()> {
1830         let mut dbg = kvm_guest_debug {
1831             #[cfg(target_arch = "x86_64")]
1832             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1833             #[cfg(target_arch = "aarch64")]
1834             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1835             ..Default::default()
1836         };
1837         if singlestep {
1838             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1839         }
1840 
1841         // Set the debug registers.
1842         // Here we assume that the number of addresses do not exceed what
1843         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1844         #[cfg(target_arch = "x86_64")]
1845         {
1846             // Set bits 9 and 10.
1847             // bit 9: GE (global exact breakpoint enable) flag.
1848             // bit 10: always 1.
1849             dbg.arch.debugreg[7] = 0x0600;
1850 
1851             for (i, addr) in addrs.iter().enumerate() {
1852                 dbg.arch.debugreg[i] = addr.0;
1853                 // Set global breakpoint enable flag
1854                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1855             }
1856         }
1857         #[cfg(target_arch = "aarch64")]
1858         {
1859             for (i, addr) in addrs.iter().enumerate() {
1860                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1861                 // bit 0: 1 (Enabled)
1862                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1863                 // bit 5~8: 0b1111 (BAS = AArch64)
1864                 // others: 0
1865                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1866                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1867                 // bit 2~52: VA[2:52]
1868                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1869             }
1870         }
1871         self.fd
1872             .lock()
1873             .unwrap()
1874             .set_guest_debug(&dbg)
1875             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1876     }
1877 
1878     #[cfg(target_arch = "aarch64")]
1879     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1880         self.fd
1881             .lock()
1882             .unwrap()
1883             .vcpu_init(kvi)
1884             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1885     }
1886 
1887     #[cfg(target_arch = "aarch64")]
1888     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
1889         self.fd
1890             .lock()
1891             .unwrap()
1892             .vcpu_finalize(&feature)
1893             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
1894     }
1895 
1896     ///
1897     /// Gets a list of the guest registers that are supported for the
1898     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1899     ///
1900     #[cfg(target_arch = "aarch64")]
1901     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1902         self.fd
1903             .lock()
1904             .unwrap()
1905             .get_reg_list(reg_list)
1906             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1907     }
1908 
1909     ///
1910     /// Gets the value of a system register
1911     ///
1912     #[cfg(target_arch = "aarch64")]
1913     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1914         //
1915         // Arm Architecture Reference Manual defines the encoding of
1916         // AArch64 system registers, see
1917         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1918         // While KVM defines another ID for each AArch64 system register,
1919         // which is used in calling `KVM_G/SET_ONE_REG` to access a system
1920         // register of a guest.
1921         // A mapping exists between the Arm standard encoding and the KVM ID.
1922         // This function takes the standard u32 ID as input parameter, converts
1923         // it to the corresponding KVM ID, and call `KVM_GET_ONE_REG` API to
1924         // get the value of the system parameter.
1925         //
1926         let id: u64 = KVM_REG_ARM64
1927             | KVM_REG_SIZE_U64
1928             | KVM_REG_ARM64_SYSREG as u64
1929             | ((((sys_reg) >> 5)
1930                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1931                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1932                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1933                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1934                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1935         let mut bytes = [0_u8; 8];
1936         self.fd
1937             .lock()
1938             .unwrap()
1939             .get_one_reg(id, &mut bytes)
1940             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
1941         Ok(u64::from_le_bytes(bytes))
1942     }
1943 
1944     ///
1945     /// Configure core registers for a given CPU.
1946     ///
1947     #[cfg(target_arch = "aarch64")]
1948     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1949         #[allow(non_upper_case_globals)]
1950         // PSR (Processor State Register) bits.
1951         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1952         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1953         const PSR_F_BIT: u64 = 0x0000_0040;
1954         const PSR_I_BIT: u64 = 0x0000_0080;
1955         const PSR_A_BIT: u64 = 0x0000_0100;
1956         const PSR_D_BIT: u64 = 0x0000_0200;
1957         // Taken from arch/arm64/kvm/inject_fault.c.
1958         const PSTATE_FAULT_BITS_64: u64 =
1959             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1960 
1961         let kreg_off = offset_of!(kvm_regs, regs);
1962 
1963         // Get the register index of the PSTATE (Processor State) register.
1964         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
1965         self.fd
1966             .lock()
1967             .unwrap()
1968             .set_one_reg(
1969                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1970                 &PSTATE_FAULT_BITS_64.to_le_bytes(),
1971             )
1972             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1973 
1974         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1975         if cpu_id == 0 {
1976             // Setting the PC (Processor Counter) to the current program address (kernel address).
1977             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
1978             self.fd
1979                 .lock()
1980                 .unwrap()
1981                 .set_one_reg(
1982                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
1983                     &boot_ip.to_le_bytes(),
1984                 )
1985                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1986 
1987             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1988             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1989             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1990             // We are choosing to place it the end of DRAM. See `get_fdt_addr`.
1991             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
1992             self.fd
1993                 .lock()
1994                 .unwrap()
1995                 .set_one_reg(
1996                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1997                     &fdt_start.to_le_bytes(),
1998                 )
1999                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2000         }
2001         Ok(())
2002     }
2003 
2004     #[cfg(target_arch = "x86_64")]
2005     ///
2006     /// Get the current CPU state
2007     ///
2008     /// Ordering requirements:
2009     ///
2010     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
2011     /// vCPU/LAPIC state. As such, it must be done before most everything
2012     /// else, otherwise we cannot restore everything and expect it to work.
2013     ///
2014     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2015     /// still running.
2016     ///
2017     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
2018     ///
2019     /// GET_VCPU_EVENTS should probably be last to save. The code looks as
2020     /// it might as well be affected by internal state modifications of the
2021     /// GET ioctls.
2022     ///
2023     /// SREGS saves/restores a pending interrupt, similar to what
2024     /// VCPU_EVENTS also does.
2025     ///
2026     /// GET_MSRS requires a prepopulated data structure to do something
2027     /// meaningful. For SET_MSRS it will then contain good data.
2028     ///
2029     /// # Example
2030     ///
2031     /// ```rust
2032     /// # use hypervisor::kvm::KvmHypervisor;
2033     /// # use std::sync::Arc;
2034     /// let kvm = KvmHypervisor::new().unwrap();
2035     /// let hv = Arc::new(kvm);
2036     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2037     /// vm.enable_split_irq().unwrap();
2038     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2039     /// let state = vcpu.state().unwrap();
2040     /// ```
2041     fn state(&self) -> cpu::Result<CpuState> {
2042         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
2043         let mp_state = self.get_mp_state()?.into();
2044         let regs = self.get_regs()?;
2045         let sregs = self.get_sregs()?;
2046         let xsave = self.get_xsave()?;
2047         let xcrs = self.get_xcrs()?;
2048         let lapic_state = self.get_lapic()?;
2049         let fpu = self.get_fpu()?;
2050 
2051         // Try to get all MSRs based on the list previously retrieved from KVM.
2052         // If the number of MSRs obtained from GET_MSRS is different from the
2053         // expected amount, we fallback onto a slower method by getting MSRs
2054         // by chunks. This is the only way to make sure we try to get as many
2055         // MSRs as possible, even if some MSRs are not supported.
2056         let mut msr_entries = self.msrs.clone();
2057 
2058         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
2059         // emulated.
2060         if self.hyperv_synic.load(Ordering::Acquire) {
2061             let hyperv_synic_msrs = vec![
2062                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
2063                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
2064                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
2065                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
2066                 0x400000b5, 0x400000b6, 0x400000b7,
2067             ];
2068             for index in hyperv_synic_msrs {
2069                 let msr = kvm_msr_entry {
2070                     index,
2071                     ..Default::default()
2072                 };
2073                 msr_entries.push(msr.into());
2074             }
2075         }
2076 
2077         let expected_num_msrs = msr_entries.len();
2078         let num_msrs = self.get_msrs(&mut msr_entries)?;
2079         let msrs = if num_msrs != expected_num_msrs {
2080             let mut faulty_msr_index = num_msrs;
2081             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
2082 
2083             loop {
2084                 warn!(
2085                     "Detected faulty MSR 0x{:x} while getting MSRs",
2086                     msr_entries[faulty_msr_index].index
2087                 );
2088 
2089                 // Skip the first bad MSR
2090                 let start_pos = faulty_msr_index + 1;
2091 
2092                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
2093                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
2094 
2095                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
2096 
2097                 if num_msrs == sub_msr_entries.len() {
2098                     break;
2099                 }
2100 
2101                 faulty_msr_index = start_pos + num_msrs;
2102             }
2103 
2104             msr_entries_tmp
2105         } else {
2106             msr_entries
2107         };
2108 
2109         let vcpu_events = self.get_vcpu_events()?;
2110         let tsc_khz = self.tsc_khz()?;
2111 
2112         Ok(VcpuKvmState {
2113             cpuid,
2114             msrs,
2115             vcpu_events,
2116             regs: regs.into(),
2117             sregs: sregs.into(),
2118             fpu,
2119             lapic_state,
2120             xsave,
2121             xcrs,
2122             mp_state,
2123             tsc_khz,
2124         }
2125         .into())
2126     }
2127 
2128     ///
2129     /// Get the current AArch64 CPU state
2130     ///
2131     #[cfg(target_arch = "aarch64")]
2132     fn state(&self) -> cpu::Result<CpuState> {
2133         let mut state = VcpuKvmState {
2134             mp_state: self.get_mp_state()?.into(),
2135             ..Default::default()
2136         };
2137         // Get core registers
2138         state.core_regs = self.get_regs()?.into();
2139 
2140         // Get systerm register
2141         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2142         // For ArmV8 there are around 500 registers.
2143         let mut sys_regs: Vec<Register> = Vec::new();
2144         let mut reg_list = RegList::new(500).unwrap();
2145         self.fd
2146             .lock()
2147             .unwrap()
2148             .get_reg_list(&mut reg_list)
2149             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2150 
2151         // At this point reg_list should contain: core registers and system
2152         // registers.
2153         // The register list contains the number of registers and their ids. We
2154         // will be needing to call KVM_GET_ONE_REG on each id in order to save
2155         // all of them. We carve out from the list  the core registers which are
2156         // represented in the kernel by kvm_regs structure and for which we can
2157         // calculate the id based on the offset in the structure.
2158         reg_list.retain(|regid| is_system_register(*regid));
2159 
2160         // Now, for the rest of the registers left in the previously fetched
2161         // register list, we are simply calling KVM_GET_ONE_REG.
2162         let indices = reg_list.as_slice();
2163         for index in indices.iter() {
2164             let mut bytes = [0_u8; 8];
2165             self.fd
2166                 .lock()
2167                 .unwrap()
2168                 .get_one_reg(*index, &mut bytes)
2169                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2170             sys_regs.push(kvm_bindings::kvm_one_reg {
2171                 id: *index,
2172                 addr: u64::from_le_bytes(bytes),
2173             });
2174         }
2175 
2176         state.sys_regs = sys_regs;
2177 
2178         Ok(state.into())
2179     }
2180 
2181     #[cfg(target_arch = "x86_64")]
2182     ///
2183     /// Restore the previously saved CPU state
2184     ///
2185     /// Ordering requirements:
2186     ///
2187     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2188     /// still running.
2189     ///
2190     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
2191     /// if we ever change the BSP, we have to do that before restoring anything.
2192     /// The same seems to be true for CPUID stuff.
2193     ///
2194     /// SREGS saves/restores a pending interrupt, similar to what
2195     /// VCPU_EVENTS also does.
2196     ///
2197     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
2198     /// done before SET_VCPU_EVENTS, which restores it.
2199     ///
2200     /// SET_LAPIC must come after SET_SREGS, because the latter restores
2201     /// the apic base msr.
2202     ///
2203     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
2204     /// only restores successfully, when the LAPIC is correctly configured.
2205     ///
2206     /// Arguments: CpuState
2207     /// # Example
2208     ///
2209     /// ```rust
2210     /// # use hypervisor::kvm::KvmHypervisor;
2211     /// # use std::sync::Arc;
2212     /// let kvm = KvmHypervisor::new().unwrap();
2213     /// let hv = Arc::new(kvm);
2214     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2215     /// vm.enable_split_irq().unwrap();
2216     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2217     /// let state = vcpu.state().unwrap();
2218     /// vcpu.set_state(&state).unwrap();
2219     /// ```
2220     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2221         let state: VcpuKvmState = state.clone().into();
2222         self.set_cpuid2(&state.cpuid)?;
2223         self.set_mp_state(state.mp_state.into())?;
2224         self.set_regs(&state.regs.into())?;
2225         self.set_sregs(&state.sregs.into())?;
2226         self.set_xsave(&state.xsave)?;
2227         self.set_xcrs(&state.xcrs)?;
2228         self.set_lapic(&state.lapic_state)?;
2229         self.set_fpu(&state.fpu)?;
2230 
2231         if let Some(freq) = state.tsc_khz {
2232             self.set_tsc_khz(freq)?;
2233         }
2234 
2235         // Try to set all MSRs previously stored.
2236         // If the number of MSRs set from SET_MSRS is different from the
2237         // expected amount, we fallback onto a slower method by setting MSRs
2238         // by chunks. This is the only way to make sure we try to set as many
2239         // MSRs as possible, even if some MSRs are not supported.
2240         let expected_num_msrs = state.msrs.len();
2241         let num_msrs = self.set_msrs(&state.msrs)?;
2242         if num_msrs != expected_num_msrs {
2243             let mut faulty_msr_index = num_msrs;
2244 
2245             loop {
2246                 warn!(
2247                     "Detected faulty MSR 0x{:x} while setting MSRs",
2248                     state.msrs[faulty_msr_index].index
2249                 );
2250 
2251                 // Skip the first bad MSR
2252                 let start_pos = faulty_msr_index + 1;
2253 
2254                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2255 
2256                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2257 
2258                 if num_msrs == sub_msr_entries.len() {
2259                     break;
2260                 }
2261 
2262                 faulty_msr_index = start_pos + num_msrs;
2263             }
2264         }
2265 
2266         self.set_vcpu_events(&state.vcpu_events)?;
2267 
2268         Ok(())
2269     }
2270 
2271     ///
2272     /// Restore the previously saved AArch64 CPU state
2273     ///
2274     #[cfg(target_arch = "aarch64")]
2275     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2276         let state: VcpuKvmState = state.clone().into();
2277         // Set core registers
2278         self.set_regs(&state.core_regs.into())?;
2279         // Set system registers
2280         for reg in &state.sys_regs {
2281             self.fd
2282                 .lock()
2283                 .unwrap()
2284                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2285                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2286         }
2287 
2288         self.set_mp_state(state.mp_state.into())?;
2289 
2290         Ok(())
2291     }
2292 
2293     ///
2294     /// Initialize TDX for this CPU
2295     ///
2296     #[cfg(feature = "tdx")]
2297     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2298         tdx_command(
2299             &self.fd.lock().unwrap().as_raw_fd(),
2300             TdxCommand::InitVcpu,
2301             0,
2302             hob_address,
2303         )
2304         .map_err(cpu::HypervisorCpuError::InitializeTdx)
2305     }
2306 
2307     ///
2308     /// Set the "immediate_exit" state
2309     ///
2310     fn set_immediate_exit(&self, exit: bool) {
2311         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
2312     }
2313 
2314     ///
2315     /// Returns the details about TDX exit reason
2316     ///
2317     #[cfg(feature = "tdx")]
2318     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2319         let mut fd = self.fd.as_ref().lock().unwrap();
2320         let kvm_run = fd.get_kvm_run();
2321         // SAFETY: accessing a union field in a valid structure
2322         let tdx_vmcall = unsafe {
2323             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2324                 as *mut KvmTdxExit))
2325                 .u
2326                 .vmcall
2327         };
2328 
2329         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2330 
2331         if tdx_vmcall.type_ != 0 {
2332             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2333         }
2334 
2335         match tdx_vmcall.subfunction {
2336             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2337             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2338                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2339             }
2340             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2341         }
2342     }
2343 
2344     ///
2345     /// Set the status code for TDX exit
2346     ///
2347     #[cfg(feature = "tdx")]
2348     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2349         let mut fd = self.fd.as_ref().lock().unwrap();
2350         let kvm_run = fd.get_kvm_run();
2351         // SAFETY: accessing a union field in a valid structure
2352         let tdx_vmcall = unsafe {
2353             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2354                 as *mut KvmTdxExit))
2355                 .u
2356                 .vmcall
2357         };
2358 
2359         tdx_vmcall.status_code = match status {
2360             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2361             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2362         };
2363     }
2364 
2365     #[cfg(target_arch = "x86_64")]
2366     ///
2367     /// Return the list of initial MSR entries for a VCPU
2368     ///
2369     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2370         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2371 
2372         [
2373             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2374             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2375             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2376             msr!(msr_index::MSR_STAR),
2377             msr!(msr_index::MSR_CSTAR),
2378             msr!(msr_index::MSR_LSTAR),
2379             msr!(msr_index::MSR_KERNEL_GS_BASE),
2380             msr!(msr_index::MSR_SYSCALL_MASK),
2381             msr!(msr_index::MSR_IA32_TSC),
2382             msr_data!(
2383                 msr_index::MSR_IA32_MISC_ENABLE,
2384                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2385             ),
2386             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2387         ]
2388         .to_vec()
2389     }
2390 
2391     #[cfg(target_arch = "aarch64")]
2392     fn has_pmu_support(&self) -> bool {
2393         let cpu_attr = kvm_bindings::kvm_device_attr {
2394             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2395             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2396             addr: 0x0,
2397             flags: 0,
2398         };
2399         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
2400     }
2401 
2402     #[cfg(target_arch = "aarch64")]
2403     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2404         let cpu_attr = kvm_bindings::kvm_device_attr {
2405             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2406             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2407             addr: 0x0,
2408             flags: 0,
2409         };
2410         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2411             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2412             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2413             addr: &irq as *const u32 as u64,
2414             flags: 0,
2415         };
2416         self.fd
2417             .lock()
2418             .unwrap()
2419             .set_device_attr(&cpu_attr_irq)
2420             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2421         self.fd
2422             .lock()
2423             .unwrap()
2424             .set_device_attr(&cpu_attr)
2425             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2426     }
2427 
2428     #[cfg(target_arch = "x86_64")]
2429     ///
2430     /// Get the frequency of the TSC if available
2431     ///
2432     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2433         match self.fd.lock().unwrap().get_tsc_khz() {
2434             Err(e) => {
2435                 if e.errno() == libc::EIO {
2436                     Ok(None)
2437                 } else {
2438                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2439                 }
2440             }
2441             Ok(v) => Ok(Some(v)),
2442         }
2443     }
2444 
2445     #[cfg(target_arch = "x86_64")]
2446     ///
2447     /// Set the frequency of the TSC if available
2448     ///
2449     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2450         match self.fd.lock().unwrap().set_tsc_khz(freq) {
2451             Err(e) => {
2452                 if e.errno() == libc::EIO {
2453                     Ok(())
2454                 } else {
2455                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2456                 }
2457             }
2458             Ok(_) => Ok(()),
2459         }
2460     }
2461 
2462     #[cfg(target_arch = "x86_64")]
2463     ///
2464     /// Trigger NMI interrupt
2465     ///
2466     fn nmi(&self) -> cpu::Result<()> {
2467         match self.fd.lock().unwrap().nmi() {
2468             Err(e) => {
2469                 if e.errno() == libc::EIO {
2470                     Ok(())
2471                 } else {
2472                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
2473                 }
2474             }
2475             Ok(_) => Ok(()),
2476         }
2477     }
2478 }
2479 
2480 impl KvmVcpu {
2481     #[cfg(target_arch = "x86_64")]
2482     ///
2483     /// X86 specific call that returns the vcpu's current "xsave struct".
2484     ///
2485     fn get_xsave(&self) -> cpu::Result<XsaveState> {
2486         Ok(self
2487             .fd
2488             .lock()
2489             .unwrap()
2490             .get_xsave()
2491             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
2492             .into())
2493     }
2494 
2495     #[cfg(target_arch = "x86_64")]
2496     ///
2497     /// X86 specific call that sets the vcpu's current "xsave struct".
2498     ///
2499     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
2500         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
2501         self.fd
2502             .lock()
2503             .unwrap()
2504             .set_xsave(&xsave)
2505             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2506     }
2507 
2508     #[cfg(target_arch = "x86_64")]
2509     ///
2510     /// X86 specific call that returns the vcpu's current "xcrs".
2511     ///
2512     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2513         self.fd
2514             .lock()
2515             .unwrap()
2516             .get_xcrs()
2517             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2518     }
2519 
2520     #[cfg(target_arch = "x86_64")]
2521     ///
2522     /// X86 specific call that sets the vcpu's current "xcrs".
2523     ///
2524     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2525         self.fd
2526             .lock()
2527             .unwrap()
2528             .set_xcrs(xcrs)
2529             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2530     }
2531 
2532     #[cfg(target_arch = "x86_64")]
2533     ///
2534     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2535     /// states of the vcpu.
2536     ///
2537     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2538         self.fd
2539             .lock()
2540             .unwrap()
2541             .get_vcpu_events()
2542             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2543     }
2544 
2545     #[cfg(target_arch = "x86_64")]
2546     ///
2547     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2548     /// of the vcpu.
2549     ///
2550     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2551         self.fd
2552             .lock()
2553             .unwrap()
2554             .set_vcpu_events(events)
2555             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2556     }
2557 }
2558