xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 9a96ea44becd0c9b6275dba4e535e6e6a18bc958)
1 // Copyright © 2024 Institute of Software, CAS. All rights reserved.
2 //
3 // Copyright © 2019 Intel Corporation
4 //
5 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
6 //
7 // Copyright © 2020, Microsoft Corporation
8 //
9 // Copyright 2018-2019 CrowdStrike, Inc.
10 //
11 //
12 
13 use std::any::Any;
14 use std::collections::HashMap;
15 #[cfg(target_arch = "x86_64")]
16 use std::fs::File;
17 #[cfg(target_arch = "x86_64")]
18 use std::os::unix::io::AsRawFd;
19 #[cfg(feature = "tdx")]
20 use std::os::unix::io::RawFd;
21 use std::result;
22 #[cfg(target_arch = "x86_64")]
23 use std::sync::atomic::{AtomicBool, Ordering};
24 use std::sync::{Arc, Mutex, RwLock};
25 
26 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
27 use vmm_sys_util::eventfd::EventFd;
28 
29 #[cfg(target_arch = "aarch64")]
30 use crate::aarch64::gic::KvmGicV3Its;
31 #[cfg(target_arch = "aarch64")]
32 pub use crate::aarch64::{
33     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuKvmState,
34 };
35 #[cfg(target_arch = "aarch64")]
36 use crate::arch::aarch64::gic::{Vgic, VgicConfig};
37 #[cfg(target_arch = "riscv64")]
38 use crate::arch::riscv64::aia::{Vaia, VaiaConfig};
39 #[cfg(target_arch = "riscv64")]
40 use crate::riscv64::aia::KvmAiaImsics;
41 #[cfg(target_arch = "riscv64")]
42 pub use crate::riscv64::{
43     aia::AiaImsicsState as AiaState, check_required_kvm_extensions, is_non_core_register,
44     VcpuKvmState,
45 };
46 use crate::vm::{self, InterruptSourceConfig, VmOps};
47 #[cfg(target_arch = "aarch64")]
48 use crate::{arm64_core_reg_id, offset_of};
49 use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType};
50 #[cfg(target_arch = "riscv64")]
51 use crate::{offset_of, riscv64_reg_id};
52 // x86_64 dependencies
53 #[cfg(target_arch = "x86_64")]
54 pub mod x86_64;
55 #[cfg(target_arch = "x86_64")]
56 use kvm_bindings::{
57     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
58     KVM_GUESTDBG_USE_HW_BP,
59 };
60 #[cfg(target_arch = "x86_64")]
61 use x86_64::check_required_kvm_extensions;
62 #[cfg(target_arch = "x86_64")]
63 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};
64 
65 #[cfg(target_arch = "x86_64")]
66 use crate::arch::x86::{
67     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
68 };
69 #[cfg(target_arch = "x86_64")]
70 use crate::ClockData;
71 use crate::{
72     CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion,
73     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
74 };
75 // aarch64 dependencies
76 #[cfg(target_arch = "aarch64")]
77 pub mod aarch64;
78 // riscv64 dependencies
79 #[cfg(target_arch = "riscv64")]
80 pub mod riscv64;
81 #[cfg(target_arch = "aarch64")]
82 use std::mem;
83 
84 ///
85 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
86 ///
87 #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
88 pub use kvm_bindings::kvm_vcpu_events as VcpuEvents;
89 pub use kvm_bindings::{
90     kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice,
91     kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
92     kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region,
93     KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
94     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
95 };
96 #[cfg(target_arch = "aarch64")]
97 use kvm_bindings::{
98     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
99     KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
100     KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
101     KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
102 };
103 #[cfg(target_arch = "riscv64")]
104 use kvm_bindings::{kvm_riscv_core, user_regs_struct, KVM_REG_RISCV_CORE};
105 #[cfg(feature = "tdx")]
106 use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
107 pub use kvm_ioctls::{Cap, Kvm, VcpuExit};
108 use thiserror::Error;
109 use vfio_ioctls::VfioDeviceFd;
110 #[cfg(feature = "tdx")]
111 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
112 pub use {kvm_bindings, kvm_ioctls};
113 
114 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
115 use crate::RegList;
116 
// Capability number for KVM_CAP_SGX_ATTRIBUTE; hardcoded here as it is
// presumably not exported by the kvm-bindings crate — TODO confirm.
#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(target_arch = "x86_64")]
use vmm_sys_util::ioctl_io_nr;
#[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
use vmm_sys_util::ioctl_ioc_nr;

// Ioctl number for KVM_NMI (inject an NMI into a vCPU), built from the
// KVM ioctl namespace and sequence number 0x9a.
#[cfg(target_arch = "x86_64")]
ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);

// Exit reason the kernel reports for a TDX-specific VM exit.
#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
// TDG.VP.VMCALL leaf numbers this backend recognizes.
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
// Status codes returned to the guest for a TDG.VP.VMCALL.
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

// Ioctl carrying TDX commands (see `TdxCommand`) down to the kernel.
#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
141 
/// Commands driving the lifecycle of a TDX guest.
///
/// Discriminants start at 0 and are part of the kernel ABI; presumably
/// these are passed through the `KVM_MEMORY_ENCRYPT_OP` ioctl defined
/// above — confirm against the tdx_* helpers further down the file.
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
151 
/// Decoded TDG.VP.VMCALL subfunctions surfaced to the VMM
/// (see `TDG_VP_VMCALL_GET_QUOTE` and
/// `TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT`).
#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

/// Outcome the VMM reports back to the guest for a handled TDX vmcall
/// (maps to `TDG_VP_VMCALL_SUCCESS` / `TDG_VP_VMCALL_INVALID_OPERAND`).
#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}
163 
// Fixed capacity of the CPUID-config table embedded in `TdxCapabilities`.
#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

/// One CPUID leaf/sub-leaf configuration entry reported by the TDX module.
/// `#[repr(C)]` so the layout matches the kernel ABI.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}
178 
/// Capabilities structure filled in by the TDX `Capabilities` command.
/// `#[repr(C)]` so the layout matches the kernel ABI.
///
/// NOTE(review): the fixed0/fixed1 pairs presumably follow the usual
/// "bits that must be 0" / "bits that must be 1" mask convention for the
/// TD attributes and XFAM — confirm with the TDX specification.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    // Number of valid entries in `cpuid_configs` (the array itself has
    // fixed capacity TDX_MAX_NR_CPUID_CONFIGS).
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}
191 
/// TDX exit information as laid out by the kernel for a `KVM_EXIT_TDX`
/// VM exit.
///
/// Fix: this struct describes kernel-provided data and its payload union
/// (`KvmTdxExitU`) is already `#[repr(C)]`, but the struct itself was not,
/// leaving its field layout unspecified in Rust. Add `#[repr(C)]` so the
/// layout is guaranteed to match the kernel ABI.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub struct KvmTdxExit {
    // Exit subtype discriminant for the `u` union.
    pub type_: u32,
    pub pad: u32,
    pub u: KvmTdxExitU,
}
199 
/// Payload union of a TDX exit; only the vmcall variant is currently
/// declared. `#[repr(C)]` to match the kernel ABI.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub union KvmTdxExitU {
    pub vmcall: KvmTdxExitVmcall,
}
206 
/// Register file of a TDG.VP.VMCALL as reported by the kernel.
///
/// By field naming, `in_*` members carry guest-provided arguments,
/// `out_*` members are written back to the guest, and `status_code`
/// holds the completion status (e.g. `TDG_VP_VMCALL_SUCCESS`).
/// `#[repr(C)]` to match the kernel ABI.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct KvmTdxExitVmcall {
    pub type_: u64,
    // Vmcall leaf number, e.g. `TDG_VP_VMCALL_GET_QUOTE`.
    pub subfunction: u64,
    pub reg_mask: u64,
    pub in_r12: u64,
    pub in_r13: u64,
    pub in_r14: u64,
    pub in_r15: u64,
    pub in_rbx: u64,
    pub in_rdi: u64,
    pub in_rsi: u64,
    pub in_r8: u64,
    pub in_r9: u64,
    pub in_rdx: u64,
    pub status_code: u64,
    pub out_r11: u64,
    pub out_r12: u64,
    pub out_r13: u64,
    pub out_r14: u64,
    pub out_r15: u64,
    pub out_rbx: u64,
    pub out_rdi: u64,
    pub out_rsi: u64,
    pub out_r8: u64,
    pub out_r9: u64,
    pub out_rdx: u64,
}
237 
238 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
239     fn from(region: kvm_userspace_memory_region) -> Self {
240         let mut flags = USER_MEMORY_REGION_READ;
241         if region.flags & KVM_MEM_READONLY == 0 {
242             flags |= USER_MEMORY_REGION_WRITE;
243         }
244         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
245             flags |= USER_MEMORY_REGION_LOG_DIRTY;
246         }
247 
248         UserMemoryRegion {
249             slot: region.slot,
250             guest_phys_addr: region.guest_phys_addr,
251             memory_size: region.memory_size,
252             userspace_addr: region.userspace_addr,
253             flags,
254         }
255     }
256 }
257 
258 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
259     fn from(region: UserMemoryRegion) -> Self {
260         assert!(
261             region.flags & USER_MEMORY_REGION_READ != 0,
262             "KVM mapped memory is always readable"
263         );
264 
265         let mut flags = 0;
266         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
267             flags |= KVM_MEM_READONLY;
268         }
269         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
270             flags |= KVM_MEM_LOG_DIRTY_PAGES;
271         }
272 
273         kvm_userspace_memory_region {
274             slot: region.slot,
275             guest_phys_addr: region.guest_phys_addr,
276             memory_size: region.memory_size,
277             userspace_addr: region.userspace_addr,
278             flags,
279         }
280     }
281 }
282 
283 impl From<kvm_mp_state> for MpState {
284     fn from(s: kvm_mp_state) -> Self {
285         MpState::Kvm(s)
286     }
287 }
288 
289 impl From<MpState> for kvm_mp_state {
290     fn from(ms: MpState) -> Self {
291         match ms {
292             MpState::Kvm(s) => s,
293             /* Needed in case other hypervisors are enabled */
294             #[allow(unreachable_patterns)]
295             _ => panic!("CpuState is not valid"),
296         }
297     }
298 }
299 
300 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
301     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
302         match a {
303             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
304             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
305         }
306     }
307 }
308 
309 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
310     fn from(a: IoEventAddress) -> Self {
311         match a {
312             IoEventAddress::Pio(x) => Self::Pio(x),
313             IoEventAddress::Mmio(x) => Self::Mmio(x),
314         }
315     }
316 }
317 
318 impl From<VcpuKvmState> for CpuState {
319     fn from(s: VcpuKvmState) -> Self {
320         CpuState::Kvm(s)
321     }
322 }
323 
324 impl From<CpuState> for VcpuKvmState {
325     fn from(s: CpuState) -> Self {
326         match s {
327             CpuState::Kvm(s) => s,
328             /* Needed in case other hypervisors are enabled */
329             #[allow(unreachable_patterns)]
330             _ => panic!("CpuState is not valid"),
331         }
332     }
333 }
334 
335 #[cfg(target_arch = "x86_64")]
336 impl From<kvm_clock_data> for ClockData {
337     fn from(d: kvm_clock_data) -> Self {
338         ClockData::Kvm(d)
339     }
340 }
341 
342 #[cfg(target_arch = "x86_64")]
343 impl From<ClockData> for kvm_clock_data {
344     fn from(ms: ClockData) -> Self {
345         match ms {
346             ClockData::Kvm(s) => s,
347             /* Needed in case other hypervisors are enabled */
348             #[allow(unreachable_patterns)]
349             _ => panic!("CpuState is not valid"),
350         }
351     }
352 }
353 
354 impl From<kvm_bindings::kvm_one_reg> for crate::Register {
355     fn from(s: kvm_bindings::kvm_one_reg) -> Self {
356         crate::Register::Kvm(s)
357     }
358 }
359 
360 impl From<crate::Register> for kvm_bindings::kvm_one_reg {
361     fn from(e: crate::Register) -> Self {
362         match e {
363             crate::Register::Kvm(e) => e,
364             /* Needed in case other hypervisors are enabled */
365             #[allow(unreachable_patterns)]
366             _ => panic!("Register is not valid"),
367         }
368     }
369 }
370 
#[cfg(target_arch = "aarch64")]
impl From<kvm_bindings::kvm_vcpu_init> for crate::VcpuInit {
    /// Wrap the raw KVM vCPU-init descriptor in the generic enum.
    fn from(init: kvm_bindings::kvm_vcpu_init) -> Self {
        crate::VcpuInit::Kvm(init)
    }
}

#[cfg(target_arch = "aarch64")]
impl From<crate::VcpuInit> for kvm_bindings::kvm_vcpu_init {
    /// Extract the raw KVM vCPU-init descriptor; panics on a non-KVM
    /// variant, which cannot occur while the KVM back-end is in use.
    fn from(init: crate::VcpuInit) -> Self {
        match init {
            crate::VcpuInit::Kvm(raw) => raw,
            // Reachable only if other hypervisor back-ends are compiled in.
            #[allow(unreachable_patterns)]
            _ => panic!("VcpuInit is not valid"),
        }
    }
}
389 
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<kvm_bindings::RegList> for crate::RegList {
    /// Wrap the raw KVM register list in the generic enum.
    fn from(list: kvm_bindings::RegList) -> Self {
        crate::RegList::Kvm(list)
    }
}

#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<crate::RegList> for kvm_bindings::RegList {
    /// Extract the raw KVM register list; panics on a non-KVM variant,
    /// which cannot occur while the KVM back-end is in use.
    fn from(list: crate::RegList) -> Self {
        match list {
            crate::RegList::Kvm(raw) => raw,
            // Reachable only if other hypervisor back-ends are compiled in.
            #[allow(unreachable_patterns)]
            _ => panic!("RegList is not valid"),
        }
    }
}
408 
409 #[cfg(not(target_arch = "riscv64"))]
410 impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
411     fn from(s: kvm_bindings::kvm_regs) -> Self {
412         crate::StandardRegisters::Kvm(s)
413     }
414 }
415 
416 #[cfg(not(target_arch = "riscv64"))]
417 impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
418     fn from(e: crate::StandardRegisters) -> Self {
419         match e {
420             crate::StandardRegisters::Kvm(e) => e,
421             /* Needed in case other hypervisors are enabled */
422             #[allow(unreachable_patterns)]
423             _ => panic!("StandardRegisters are not valid"),
424         }
425     }
426 }
427 
#[cfg(target_arch = "riscv64")]
impl From<kvm_bindings::kvm_riscv_core> for crate::StandardRegisters {
    /// Wrap the raw KVM RISC-V core register set in the generic enum.
    fn from(regs: kvm_bindings::kvm_riscv_core) -> Self {
        crate::StandardRegisters::Kvm(regs)
    }
}

#[cfg(target_arch = "riscv64")]
impl From<crate::StandardRegisters> for kvm_bindings::kvm_riscv_core {
    /// Extract the raw KVM RISC-V core register set; panics on a non-KVM
    /// variant, which cannot occur while the KVM back-end is in use.
    fn from(regs: crate::StandardRegisters) -> Self {
        match regs {
            crate::StandardRegisters::Kvm(raw) => raw,
            // Reachable only if other hypervisor back-ends are compiled in.
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}
446 
447 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
448     fn from(s: kvm_irq_routing_entry) -> Self {
449         IrqRoutingEntry::Kvm(s)
450     }
451 }
452 
453 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
454     fn from(e: IrqRoutingEntry) -> Self {
455         match e {
456             IrqRoutingEntry::Kvm(e) => e,
457             /* Needed in case other hypervisors are enabled */
458             #[allow(unreachable_patterns)]
459             _ => panic!("IrqRoutingEntry is not valid"),
460         }
461     }
462 }
463 
/// Bookkeeping for a guest memory slot that requested dirty-page logging.
///
/// Slots are recorded in `create_user_memory_region` and replayed by
/// `start_dirty_log`/`stop_dirty_log`, which re-register the region with
/// `KVM_MEM_LOG_DIRTY_PAGES` set or cleared.
struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}
470 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the kernel VM file descriptor.
    fd: Arc<VmFd>,
    // MSR entries cloned into each vCPU created from this VM.
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    // Slots that asked for dirty-page logging, keyed by slot number;
    // consulted by start_dirty_log/stop_dirty_log.
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
478 
479 impl KvmVm {
480     ///
481     /// Creates an emulated device in the kernel.
482     ///
483     /// See the documentation for `KVM_CREATE_DEVICE`.
484     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
485         let device_fd = self
486             .fd
487             .create_device(device)
488             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
489         Ok(VfioDeviceFd::new_from_kvm(device_fd))
490     }
491     /// Checks if a particular `Cap` is available.
492     pub fn check_extension(&self, c: Cap) -> bool {
493         self.fd.check_extension(c)
494     }
495 }
496 
497 /// Implementation of Vm trait for KVM
498 ///
499 /// # Examples
500 ///
501 /// ```
502 /// # use hypervisor::kvm::KvmHypervisor;
503 /// # use std::sync::Arc;
504 /// let kvm = KvmHypervisor::new().unwrap();
505 /// let hypervisor = Arc::new(kvm);
506 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
507 /// ```
508 impl vm::Vm for KvmVm {
509     #[cfg(target_arch = "x86_64")]
510     ///
511     /// Sets the address of the one-page region in the VM's address space.
512     ///
513     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
514         self.fd
515             .set_identity_map_address(address)
516             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
517     }
518 
519     #[cfg(target_arch = "x86_64")]
520     ///
521     /// Sets the address of the three-page region in the VM's address space.
522     ///
523     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
524         self.fd
525             .set_tss_address(offset)
526             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
527     }
528 
529     #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
530     ///
531     /// Creates an in-kernel interrupt controller.
532     ///
533     fn create_irq_chip(&self) -> vm::Result<()> {
534         self.fd
535             .create_irq_chip()
536             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
537     }
538 
539     ///
540     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
541     ///
542     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
543         self.fd
544             .register_irqfd(fd, gsi)
545             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
546     }
547 
548     ///
549     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
550     ///
551     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
552         self.fd
553             .unregister_irqfd(fd, gsi)
554             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
555     }
556 
557     ///
558     /// Creates a VcpuFd object from a vcpu RawFd.
559     ///
560     fn create_vcpu(
561         &self,
562         id: u8,
563         vm_ops: Option<Arc<dyn VmOps>>,
564     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
565         let fd = self
566             .fd
567             .create_vcpu(id as u64)
568             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
569         let vcpu = KvmVcpu {
570             fd: Arc::new(Mutex::new(fd)),
571             #[cfg(target_arch = "x86_64")]
572             msrs: self.msrs.clone(),
573             vm_ops,
574             #[cfg(target_arch = "x86_64")]
575             hyperv_synic: AtomicBool::new(false),
576         };
577         Ok(Arc::new(vcpu))
578     }
579 
580     #[cfg(target_arch = "aarch64")]
581     ///
582     /// Creates a virtual GIC device.
583     ///
584     fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
585         let gic_device = KvmGicV3Its::new(self, config)
586             .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
587         Ok(Arc::new(Mutex::new(gic_device)))
588     }
589 
590     #[cfg(target_arch = "riscv64")]
591     ///
592     /// Creates a virtual AIA device.
593     ///
594     fn create_vaia(&self, config: VaiaConfig) -> vm::Result<Arc<Mutex<dyn Vaia>>> {
595         let aia_device = KvmAiaImsics::new(self, config)
596             .map_err(|e| vm::HypervisorVmError::CreateVaia(anyhow!("Vaia error {:?}", e)))?;
597         Ok(Arc::new(Mutex::new(aia_device)))
598     }
599 
600     ///
601     /// Registers an event to be signaled whenever a certain address is written to.
602     ///
603     fn register_ioevent(
604         &self,
605         fd: &EventFd,
606         addr: &IoEventAddress,
607         datamatch: Option<vm::DataMatch>,
608     ) -> vm::Result<()> {
609         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
610         if let Some(dm) = datamatch {
611             match dm {
612                 vm::DataMatch::DataMatch32(kvm_dm32) => self
613                     .fd
614                     .register_ioevent(fd, addr, kvm_dm32)
615                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
616                 vm::DataMatch::DataMatch64(kvm_dm64) => self
617                     .fd
618                     .register_ioevent(fd, addr, kvm_dm64)
619                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
620             }
621         } else {
622             self.fd
623                 .register_ioevent(fd, addr, NoDatamatch)
624                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
625         }
626     }
627 
628     ///
629     /// Unregisters an event from a certain address it has been previously registered to.
630     ///
631     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
632         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
633         self.fd
634             .unregister_ioevent(fd, addr, NoDatamatch)
635             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
636     }
637 
638     ///
639     /// Constructs a routing entry
640     ///
641     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
642         match &config {
643             InterruptSourceConfig::MsiIrq(cfg) => {
644                 let mut kvm_route = kvm_irq_routing_entry {
645                     gsi,
646                     type_: KVM_IRQ_ROUTING_MSI,
647                     ..Default::default()
648                 };
649 
650                 kvm_route.u.msi.address_lo = cfg.low_addr;
651                 kvm_route.u.msi.address_hi = cfg.high_addr;
652                 kvm_route.u.msi.data = cfg.data;
653 
654                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
655                     // On AArch64, there is limitation on the range of the 'devid',
656                     // it cannot be greater than 65536 (the max of u16).
657                     //
658                     // BDF cannot be used directly, because 'segment' is in high
659                     // 16 bits. The layout of the u32 BDF is:
660                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
661                     // |      segment    |     bus    |   device   |  function  |
662                     //
663                     // Now that we support 1 bus only in a segment, we can build a
664                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
665                     // 'segment' data.
666                     // This way we can resolve the range checking problem and give
667                     // different `devid` to all the devices. Limitation is that at
668                     // most 256 segments can be supported.
669                     //
670                     let modified_devid = ((cfg.devid & 0x00ff_0000) >> 8) | cfg.devid & 0xff;
671 
672                     kvm_route.flags = KVM_MSI_VALID_DEVID;
673                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
674                 }
675                 kvm_route.into()
676             }
677             InterruptSourceConfig::LegacyIrq(cfg) => {
678                 let mut kvm_route = kvm_irq_routing_entry {
679                     gsi,
680                     type_: KVM_IRQ_ROUTING_IRQCHIP,
681                     ..Default::default()
682                 };
683                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
684                 kvm_route.u.irqchip.pin = cfg.pin;
685 
686                 kvm_route.into()
687             }
688         }
689     }
690 
691     ///
692     /// Sets the GSI routing table entries, overwriting any previously set
693     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
694     ///
695     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
696         let mut irq_routing =
697             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
698         irq_routing[0].nr = entries.len() as u32;
699         irq_routing[0].flags = 0;
700         let entries: Vec<kvm_irq_routing_entry> = entries
701             .iter()
702             .map(|entry| match entry {
703                 IrqRoutingEntry::Kvm(e) => *e,
704                 #[allow(unreachable_patterns)]
705                 _ => panic!("IrqRoutingEntry type is wrong"),
706             })
707             .collect();
708 
709         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
710         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
711         // everything from entries.
712         unsafe {
713             let entries_slice: &mut [kvm_irq_routing_entry] =
714                 irq_routing[0].entries.as_mut_slice(entries.len());
715             entries_slice.copy_from_slice(&entries);
716         }
717 
718         self.fd
719             .set_gsi_routing(&irq_routing[0])
720             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
721     }
722 
723     ///
724     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
725     ///
726     fn make_user_memory_region(
727         &self,
728         slot: u32,
729         guest_phys_addr: u64,
730         memory_size: u64,
731         userspace_addr: u64,
732         readonly: bool,
733         log_dirty_pages: bool,
734     ) -> UserMemoryRegion {
735         kvm_userspace_memory_region {
736             slot,
737             guest_phys_addr,
738             memory_size,
739             userspace_addr,
740             flags: if readonly { KVM_MEM_READONLY } else { 0 }
741                 | if log_dirty_pages {
742                     KVM_MEM_LOG_DIRTY_PAGES
743                 } else {
744                     0
745                 },
746         }
747         .into()
748     }
749 
750     ///
751     /// Creates a guest physical memory region.
752     ///
753     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
754         let mut region: kvm_userspace_memory_region = user_memory_region.into();
755 
756         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
757             if (region.flags & KVM_MEM_READONLY) != 0 {
758                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
759                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
760                 )));
761             }
762 
763             // Keep track of the regions that need dirty pages log
764             self.dirty_log_slots.write().unwrap().insert(
765                 region.slot,
766                 KvmDirtyLogSlot {
767                     slot: region.slot,
768                     guest_phys_addr: region.guest_phys_addr,
769                     memory_size: region.memory_size,
770                     userspace_addr: region.userspace_addr,
771                 },
772             );
773 
774             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
775             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
776             region.flags = 0;
777         }
778 
779         // SAFETY: Safe because guest regions are guaranteed not to overlap.
780         unsafe {
781             self.fd
782                 .set_user_memory_region(region)
783                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
784         }
785     }
786 
787     ///
788     /// Removes a guest physical memory region.
789     ///
790     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
791         let mut region: kvm_userspace_memory_region = user_memory_region.into();
792 
793         // Remove the corresponding entry from "self.dirty_log_slots" if needed
794         self.dirty_log_slots.write().unwrap().remove(&region.slot);
795 
796         // Setting the size to 0 means "remove"
797         region.memory_size = 0;
798         // SAFETY: Safe because guest regions are guaranteed not to overlap.
799         unsafe {
800             self.fd
801                 .set_user_memory_region(region)
802                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
803         }
804     }
805 
806     ///
807     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
808     ///
809     #[cfg(target_arch = "aarch64")]
810     fn get_preferred_target(&self, kvi: &mut crate::VcpuInit) -> vm::Result<()> {
811         let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
812         self.fd
813             .get_preferred_target(&mut kvm_kvi)
814             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))?;
815         *kvi = kvm_kvi.into();
816         Ok(())
817     }
818 
819     #[cfg(target_arch = "x86_64")]
820     fn enable_split_irq(&self) -> vm::Result<()> {
821         // Create split irqchip
822         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
823         // are not.
824         let mut cap = kvm_enable_cap {
825             cap: KVM_CAP_SPLIT_IRQCHIP,
826             ..Default::default()
827         };
828         cap.args[0] = NUM_IOAPIC_PINS as u64;
829         self.fd
830             .enable_cap(&cap)
831             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
832         Ok(())
833     }
834 
835     #[cfg(target_arch = "x86_64")]
836     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
837         let mut cap = kvm_enable_cap {
838             cap: KVM_CAP_SGX_ATTRIBUTE,
839             ..Default::default()
840         };
841         cap.args[0] = file.as_raw_fd() as u64;
842         self.fd
843             .enable_cap(&cap)
844             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
845         Ok(())
846     }
847 
848     /// Retrieve guest clock.
849     #[cfg(target_arch = "x86_64")]
850     fn get_clock(&self) -> vm::Result<ClockData> {
851         Ok(self
852             .fd
853             .get_clock()
854             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
855             .into())
856     }
857 
858     /// Set guest clock.
859     #[cfg(target_arch = "x86_64")]
860     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
861         let data = (*data).into();
862         self.fd
863             .set_clock(&data)
864             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
865     }
866 
867     /// Create a device that is used for passthrough
868     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
869         let mut vfio_dev = kvm_create_device {
870             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
871             fd: 0,
872             flags: 0,
873         };
874 
875         self.create_device(&mut vfio_dev)
876             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
877     }
878 
879     ///
880     /// Start logging dirty pages
881     ///
882     fn start_dirty_log(&self) -> vm::Result<()> {
883         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
884         for (_, s) in dirty_log_slots.iter() {
885             let region = kvm_userspace_memory_region {
886                 slot: s.slot,
887                 guest_phys_addr: s.guest_phys_addr,
888                 memory_size: s.memory_size,
889                 userspace_addr: s.userspace_addr,
890                 flags: KVM_MEM_LOG_DIRTY_PAGES,
891             };
892             // SAFETY: Safe because guest regions are guaranteed not to overlap.
893             unsafe {
894                 self.fd
895                     .set_user_memory_region(region)
896                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
897             }
898         }
899 
900         Ok(())
901     }
902 
903     ///
904     /// Stop logging dirty pages
905     ///
906     fn stop_dirty_log(&self) -> vm::Result<()> {
907         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
908         for (_, s) in dirty_log_slots.iter() {
909             let region = kvm_userspace_memory_region {
910                 slot: s.slot,
911                 guest_phys_addr: s.guest_phys_addr,
912                 memory_size: s.memory_size,
913                 userspace_addr: s.userspace_addr,
914                 flags: 0,
915             };
916             // SAFETY: Safe because guest regions are guaranteed not to overlap.
917             unsafe {
918                 self.fd
919                     .set_user_memory_region(region)
920                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
921             }
922         }
923 
924         Ok(())
925     }
926 
927     ///
928     /// Get dirty pages bitmap (one bit per page)
929     ///
930     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
931         self.fd
932             .get_dirty_log(slot, memory_size as usize)
933             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
934     }
935 
    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        // Bit index of the SEPT_VE_DISABLE TD attribute; set below via
        // `1 << TDX_ATTR_SEPT_VE_DISABLE` in the `attributes` field.
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        // Convert the hypervisor-agnostic CPUID entries to the KVM layout and
        // pad the table to exactly 256 entries, the fixed size of the
        // `cpuid_entries` array in the ioctl payload below.
        // NOTE(review): a table larger than 256 entries would be silently
        // truncated by `resize` — confirm callers never exceed this.
        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        // In-memory layout of the KVM_TDX_INIT_VM argument; #[repr(C)] keeps
        // the field order and padding stable across the ioctl boundary.
        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            // Measurement registers are left zeroed here.
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            // Cannot fail: `cpuid` was resized to exactly 256 entries above.
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }
979 
980     ///
981     /// Finalize the TDX setup for this VM
982     ///
983     #[cfg(feature = "tdx")]
984     fn tdx_finalize(&self) -> vm::Result<()> {
985         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
986             .map_err(vm::HypervisorVmError::FinalizeTdx)
987     }
988 
989     ///
990     /// Initialize memory regions for the TDX VM
991     ///
992     #[cfg(feature = "tdx")]
993     fn tdx_init_memory_region(
994         &self,
995         host_address: u64,
996         guest_address: u64,
997         size: u64,
998         measure: bool,
999     ) -> vm::Result<()> {
1000         #[repr(C)]
1001         struct TdxInitMemRegion {
1002             host_address: u64,
1003             guest_address: u64,
1004             pages: u64,
1005         }
1006         let data = TdxInitMemRegion {
1007             host_address,
1008             guest_address,
1009             pages: size / 4096,
1010         };
1011 
1012         tdx_command(
1013             &self.fd.as_raw_fd(),
1014             TdxCommand::InitMemRegion,
1015             u32::from(measure),
1016             &data as *const _ as u64,
1017         )
1018         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
1019     }
1020 
    /// Downcast to the underlying KvmVm type
    // Exposes `self` as `&dyn Any` so callers can `downcast_ref::<KvmVm>()`.
    fn as_any(&self) -> &dyn Any {
        self
    }
1025 }
1026 
#[cfg(feature = "tdx")]
/// Issue a TDX command through the `KVM_MEMORY_ENCRYPT_OP` ioctl on `fd`.
///
/// `fd` is the raw fd the ioctl is issued on (the VM fd or the KVM system fd
/// at the call sites in this file). `flags` and `data` are forwarded verbatim
/// in the command structure; `data` is typically a pointer to a command
/// payload cast to u64. Returns the OS error when the ioctl fails.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Argument layout expected by the kernel; #[repr(C)] keeps field order
    // and padding stable across the FFI boundary.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        // NOTE(review): presumably filled by the kernel with a TDX error
        // code — it is not inspected here. Confirm against the kernel ABI.
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    // A negative return value signals failure; errno holds the cause.
    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
1063 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // System-level KVM handle (backed by /dev/kvm) used for all
    // hypervisor-scoped ioctls such as VM creation and capability queries.
    kvm: Kvm,
}
1068 
1069 impl KvmHypervisor {
1070     #[cfg(target_arch = "x86_64")]
1071     ///
1072     /// Retrieve the list of MSRs supported by the hypervisor.
1073     ///
1074     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
1075         self.kvm
1076             .get_msr_index_list()
1077             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
1078     }
1079 }
1080 
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    /// A KVM capability required by this crate is not exposed by the host.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}

/// Convenience alias for results whose error type is [`KvmError`].
pub type KvmResult<T> = result::Result<T, KvmError>;
1089 
1090 impl KvmHypervisor {
1091     /// Create a hypervisor based on Kvm
1092     #[allow(clippy::new_ret_no_self)]
1093     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
1094         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
1095         let api_version = kvm_obj.get_api_version();
1096 
1097         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
1098             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
1099         }
1100 
1101         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
1102     }
1103 
1104     /// Check if the hypervisor is available
1105     pub fn is_available() -> hypervisor::Result<bool> {
1106         match std::fs::metadata("/dev/kvm") {
1107             Ok(_) => Ok(true),
1108             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1109             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
1110                 err.into(),
1111             )),
1112         }
1113     }
1114 }
1115 
/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    ///
    /// Create a Vm of a specific type using the underlying hypervisor, passing memory size
    /// Return a hypervisor-agnostic Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type_and_memory(0).unwrap();
    /// ```
    fn create_vm_with_type_and_memory(
        &self,
        vm_type: u64,
        // The memory size is ignored on KVM; the parameter only exists when
        // the "sev_snp" feature (an MSHV concern) is compiled in.
        #[cfg(feature = "sev_snp")] _mem_size: u64,
    ) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        self.create_vm_with_type(vm_type)
    }

    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            // Pre-build the MSR entry table for this VM: one zeroed entry per
            // host-supported MSR, with only the index filled in.
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        {
            // No MSR concept on these architectures: the VM only needs the fd
            // and the dirty-log bookkeeping.
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    // Verify the host exposes every KVM capability this crate depends on
    // (arch-specific list lives in the per-arch modules).
    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        // Convert each KVM cpuid entry to the hypervisor-agnostic type.
        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        // Size the structure for the maximum number of CPUID configs. The
        // kernel fills `data` in place through the raw pointer passed below,
        // which is why it can be returned afterwards.
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            // Fixed by the architecture: debug registers DR0-DR3.
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        // Clamp to u32::MAX first so the narrowing cast cannot truncate.
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}
1311 
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Shared handle to the KVM vCPU fd; the mutex serializes ioctl access
    // from multiple threads.
    fd: Arc<Mutex<VcpuFd>>,
    // MSR entry table used by the MSR save/restore paths.
    // NOTE(review): presumably seeded from KvmVm::msrs at vCPU creation —
    // the creation site is outside this view; confirm there.
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    // Optional hooks back into the VMM; None when the vCPU runs without them.
    // NOTE(review): likely used while servicing vCPU exits — confirm in run().
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    // NOTE(review): flag named after the Hyper-V SynIC capability; its
    // set/read sites are outside this view — confirm intent there.
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
1321 
1322 /// Implementation of Vcpu trait for KVM
1323 ///
1324 /// # Examples
1325 ///
1326 /// ```
1327 /// # use hypervisor::kvm::KvmHypervisor;
1328 /// # use std::sync::Arc;
1329 /// let kvm = KvmHypervisor::new().unwrap();
1330 /// let hypervisor = Arc::new(kvm);
1331 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1332 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1333 /// ```
1334 impl cpu::Vcpu for KvmVcpu {
1335     ///
1336     /// Returns StandardRegisters with default value set
1337     ///
1338     fn create_standard_regs(&self) -> StandardRegisters {
1339         #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
1340         {
1341             kvm_bindings::kvm_regs::default().into()
1342         }
1343         #[cfg(target_arch = "riscv64")]
1344         {
1345             kvm_bindings::kvm_riscv_core::default().into()
1346         }
1347     }
1348     #[cfg(target_arch = "x86_64")]
1349     ///
1350     /// Returns the vCPU general purpose registers.
1351     ///
1352     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1353         Ok(self
1354             .fd
1355             .lock()
1356             .unwrap()
1357             .get_regs()
1358             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1359             .into())
1360     }
1361 
1362     ///
1363     /// Returns the vCPU general purpose registers.
1364     /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
1365     /// is used to get registers one by one.
1366     ///
1367     #[cfg(target_arch = "aarch64")]
1368     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1369         let mut state = kvm_regs::default();
1370         let mut off = offset_of!(user_pt_regs, regs);
1371         // There are 31 user_pt_regs:
1372         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1373         // These actually are the general-purpose registers of the Armv8-a
1374         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1375         for i in 0..31 {
1376             let mut bytes = [0_u8; 8];
1377             self.fd
1378                 .lock()
1379                 .unwrap()
1380                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1381                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1382             state.regs.regs[i] = u64::from_le_bytes(bytes);
1383             off += std::mem::size_of::<u64>();
1384         }
1385 
1386         // We are now entering the "Other register" section of the ARMv8-a architecture.
1387         // First one, stack pointer.
1388         let off = offset_of!(user_pt_regs, sp);
1389         let mut bytes = [0_u8; 8];
1390         self.fd
1391             .lock()
1392             .unwrap()
1393             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1394             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1395         state.regs.sp = u64::from_le_bytes(bytes);
1396 
1397         // Second one, the program counter.
1398         let off = offset_of!(user_pt_regs, pc);
1399         let mut bytes = [0_u8; 8];
1400         self.fd
1401             .lock()
1402             .unwrap()
1403             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1404             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1405         state.regs.pc = u64::from_le_bytes(bytes);
1406 
1407         // Next is the processor state.
1408         let off = offset_of!(user_pt_regs, pstate);
1409         let mut bytes = [0_u8; 8];
1410         self.fd
1411             .lock()
1412             .unwrap()
1413             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1414             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1415         state.regs.pstate = u64::from_le_bytes(bytes);
1416 
1417         // The stack pointer associated with EL1
1418         let off = offset_of!(kvm_regs, sp_el1);
1419         let mut bytes = [0_u8; 8];
1420         self.fd
1421             .lock()
1422             .unwrap()
1423             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1424             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1425         state.sp_el1 = u64::from_le_bytes(bytes);
1426 
1427         // Exception Link Register for EL1, when taking an exception to EL1, this register
1428         // holds the address to which to return afterwards.
1429         let off = offset_of!(kvm_regs, elr_el1);
1430         let mut bytes = [0_u8; 8];
1431         self.fd
1432             .lock()
1433             .unwrap()
1434             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1435             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1436         state.elr_el1 = u64::from_le_bytes(bytes);
1437 
1438         // Saved Program Status Registers, there are 5 of them used in the kernel.
1439         let mut off = offset_of!(kvm_regs, spsr);
1440         for i in 0..KVM_NR_SPSR as usize {
1441             let mut bytes = [0_u8; 8];
1442             self.fd
1443                 .lock()
1444                 .unwrap()
1445                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1446                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1447             state.spsr[i] = u64::from_le_bytes(bytes);
1448             off += std::mem::size_of::<u64>();
1449         }
1450 
1451         // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
1452         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1453         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1454         for i in 0..32 {
1455             let mut bytes = [0_u8; 16];
1456             self.fd
1457                 .lock()
1458                 .unwrap()
1459                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
1460                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1461             state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
1462             off += mem::size_of::<u128>();
1463         }
1464 
1465         // Floating-point Status Register
1466         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1467         let mut bytes = [0_u8; 4];
1468         self.fd
1469             .lock()
1470             .unwrap()
1471             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1472             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1473         state.fp_regs.fpsr = u32::from_le_bytes(bytes);
1474 
1475         // Floating-point Control Register
1476         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1477         let mut bytes = [0_u8; 4];
1478         self.fd
1479             .lock()
1480             .unwrap()
1481             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1482             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1483         state.fp_regs.fpcr = u32::from_le_bytes(bytes);
1484         Ok(state.into())
1485     }
1486 
    #[cfg(target_arch = "riscv64")]
    ///
    /// Returns the RISC-V vCPU core registers.
    /// The `KVM_GET_REGS` ioctl is not available on RISC-V 64-bit,
    /// `KVM_GET_ONE_REG` is used to get registers one by one.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state = kvm_riscv_core::default();

        /// Macro used to extract RISC-V register data from KVM Vcpu according
        /// to `$reg_name` provided to `state`.
        macro_rules! riscv64_get_one_reg_from_vcpu {
            // `mode` needs a dedicated arm: it lives directly on
            // `kvm_riscv_core`, not inside the nested `regs` field.
            (mode) => {
                let off = offset_of!(kvm_riscv_core, mode);
                let mut bytes = [0_u8; 8];
                self.fd
                    .lock()
                    .unwrap()
                    .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
                    .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
                state.mode = u64::from_le_bytes(bytes);
            };
            // Every other core register is a `user_regs_struct` field.
            ($reg_name:ident) => {
                let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
                let mut bytes = [0_u8; 8];
                self.fd
                    .lock()
                    .unwrap()
                    .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
                    .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
                state.regs.$reg_name = u64::from_le_bytes(bytes);
            };
        }

        // Read every core register in canonical order, then the privilege mode.
        riscv64_get_one_reg_from_vcpu!(pc);
        riscv64_get_one_reg_from_vcpu!(ra);
        riscv64_get_one_reg_from_vcpu!(sp);
        riscv64_get_one_reg_from_vcpu!(gp);
        riscv64_get_one_reg_from_vcpu!(tp);
        riscv64_get_one_reg_from_vcpu!(t0);
        riscv64_get_one_reg_from_vcpu!(t1);
        riscv64_get_one_reg_from_vcpu!(t2);
        riscv64_get_one_reg_from_vcpu!(s0);
        riscv64_get_one_reg_from_vcpu!(s1);
        riscv64_get_one_reg_from_vcpu!(a0);
        riscv64_get_one_reg_from_vcpu!(a1);
        riscv64_get_one_reg_from_vcpu!(a2);
        riscv64_get_one_reg_from_vcpu!(a3);
        riscv64_get_one_reg_from_vcpu!(a4);
        riscv64_get_one_reg_from_vcpu!(a5);
        riscv64_get_one_reg_from_vcpu!(a6);
        riscv64_get_one_reg_from_vcpu!(a7);
        riscv64_get_one_reg_from_vcpu!(s2);
        riscv64_get_one_reg_from_vcpu!(s3);
        riscv64_get_one_reg_from_vcpu!(s4);
        riscv64_get_one_reg_from_vcpu!(s5);
        riscv64_get_one_reg_from_vcpu!(s6);
        riscv64_get_one_reg_from_vcpu!(s7);
        riscv64_get_one_reg_from_vcpu!(s8);
        riscv64_get_one_reg_from_vcpu!(s9);
        riscv64_get_one_reg_from_vcpu!(s10);
        riscv64_get_one_reg_from_vcpu!(s11);
        riscv64_get_one_reg_from_vcpu!(t3);
        riscv64_get_one_reg_from_vcpu!(t4);
        riscv64_get_one_reg_from_vcpu!(t5);
        riscv64_get_one_reg_from_vcpu!(t6);
        riscv64_get_one_reg_from_vcpu!(mode);

        Ok(state.into())
    }
1557 
1558     #[cfg(target_arch = "x86_64")]
1559     ///
1560     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1561     ///
1562     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1563         let regs = (*regs).into();
1564         self.fd
1565             .lock()
1566             .unwrap()
1567             .set_regs(&regs)
1568             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1569     }
1570 
1571     ///
1572     /// Sets the vCPU general purpose registers.
1573     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1574     /// is used to set registers one by one.
1575     ///
1576     #[cfg(target_arch = "aarch64")]
1577     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1578         // The function follows the exact identical order from `state`. Look there
1579         // for some additional info on registers.
1580         let kvm_regs_state: kvm_regs = (*state).into();
1581         let mut off = offset_of!(user_pt_regs, regs);
1582         for i in 0..31 {
1583             self.fd
1584                 .lock()
1585                 .unwrap()
1586                 .set_one_reg(
1587                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1588                     &kvm_regs_state.regs.regs[i].to_le_bytes(),
1589                 )
1590                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1591             off += std::mem::size_of::<u64>();
1592         }
1593 
1594         let off = offset_of!(user_pt_regs, sp);
1595         self.fd
1596             .lock()
1597             .unwrap()
1598             .set_one_reg(
1599                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1600                 &kvm_regs_state.regs.sp.to_le_bytes(),
1601             )
1602             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1603 
1604         let off = offset_of!(user_pt_regs, pc);
1605         self.fd
1606             .lock()
1607             .unwrap()
1608             .set_one_reg(
1609                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1610                 &kvm_regs_state.regs.pc.to_le_bytes(),
1611             )
1612             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1613 
1614         let off = offset_of!(user_pt_regs, pstate);
1615         self.fd
1616             .lock()
1617             .unwrap()
1618             .set_one_reg(
1619                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1620                 &kvm_regs_state.regs.pstate.to_le_bytes(),
1621             )
1622             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1623 
1624         let off = offset_of!(kvm_regs, sp_el1);
1625         self.fd
1626             .lock()
1627             .unwrap()
1628             .set_one_reg(
1629                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1630                 &kvm_regs_state.sp_el1.to_le_bytes(),
1631             )
1632             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1633 
1634         let off = offset_of!(kvm_regs, elr_el1);
1635         self.fd
1636             .lock()
1637             .unwrap()
1638             .set_one_reg(
1639                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1640                 &kvm_regs_state.elr_el1.to_le_bytes(),
1641             )
1642             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1643 
1644         let mut off = offset_of!(kvm_regs, spsr);
1645         for i in 0..KVM_NR_SPSR as usize {
1646             self.fd
1647                 .lock()
1648                 .unwrap()
1649                 .set_one_reg(
1650                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1651                     &kvm_regs_state.spsr[i].to_le_bytes(),
1652                 )
1653                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1654             off += std::mem::size_of::<u64>();
1655         }
1656 
1657         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1658         for i in 0..32 {
1659             self.fd
1660                 .lock()
1661                 .unwrap()
1662                 .set_one_reg(
1663                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1664                     &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
1665                 )
1666                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1667             off += mem::size_of::<u128>();
1668         }
1669 
1670         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1671         self.fd
1672             .lock()
1673             .unwrap()
1674             .set_one_reg(
1675                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1676                 &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
1677             )
1678             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1679 
1680         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1681         self.fd
1682             .lock()
1683             .unwrap()
1684             .set_one_reg(
1685                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1686                 &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
1687             )
1688             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1689         Ok(())
1690     }
1691 
1692     #[cfg(target_arch = "riscv64")]
1693     ///
1694     /// Sets the RISC-V vCPU core registers.
1695     /// The `KVM_SET_REGS` ioctl is not available on RISC-V 64-bit,
1696     /// `KVM_SET_ONE_REG` is used to set registers one by one.
1697     ///
1698     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1699         // The function follows the exact identical order from `state`. Look there
1700         // for some additional info on registers.
1701         let kvm_regs_state: kvm_riscv_core = (*state).into();
1702 
1703         /// Macro used to set value of specific RISC-V `$reg_name` stored in
1704         /// `state` to KVM Vcpu.
1705         macro_rules! riscv64_set_one_reg_to_vcpu {
1706             (mode) => {
1707                 let off = offset_of!(kvm_riscv_core, mode);
1708                 self.fd
1709                     .lock()
1710                     .unwrap()
1711                     .set_one_reg(
1712                         riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1713                         &kvm_regs_state.mode.to_le_bytes(),
1714                     )
1715                     .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1716             };
1717             ($reg_name:ident) => {
1718                 let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
1719                 self.fd
1720                     .lock()
1721                     .unwrap()
1722                     .set_one_reg(
1723                         riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1724                         &kvm_regs_state.regs.$reg_name.to_le_bytes(),
1725                     )
1726                     .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1727             };
1728         }
1729 
1730         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc);
1731         self.fd
1732             .lock()
1733             .unwrap()
1734             .set_one_reg(
1735                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1736                 &kvm_regs_state.regs.pc.to_le_bytes(),
1737             )
1738             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1739 
1740         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, ra);
1741         self.fd
1742             .lock()
1743             .unwrap()
1744             .set_one_reg(
1745                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1746                 &kvm_regs_state.regs.ra.to_le_bytes(),
1747             )
1748             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1749 
1750         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, sp);
1751         self.fd
1752             .lock()
1753             .unwrap()
1754             .set_one_reg(
1755                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1756                 &kvm_regs_state.regs.sp.to_le_bytes(),
1757             )
1758             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1759 
1760         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, gp);
1761         self.fd
1762             .lock()
1763             .unwrap()
1764             .set_one_reg(
1765                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1766                 &kvm_regs_state.regs.gp.to_le_bytes(),
1767             )
1768             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1769 
1770         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, tp);
1771         self.fd
1772             .lock()
1773             .unwrap()
1774             .set_one_reg(
1775                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1776                 &kvm_regs_state.regs.tp.to_le_bytes(),
1777             )
1778             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1779 
1780         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t0);
1781         self.fd
1782             .lock()
1783             .unwrap()
1784             .set_one_reg(
1785                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1786                 &kvm_regs_state.regs.t0.to_le_bytes(),
1787             )
1788             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1789 
1790         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t1);
1791         self.fd
1792             .lock()
1793             .unwrap()
1794             .set_one_reg(
1795                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1796                 &kvm_regs_state.regs.t1.to_le_bytes(),
1797             )
1798             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1799 
1800         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t2);
1801         self.fd
1802             .lock()
1803             .unwrap()
1804             .set_one_reg(
1805                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1806                 &kvm_regs_state.regs.t2.to_le_bytes(),
1807             )
1808             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1809 
1810         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s0);
1811         self.fd
1812             .lock()
1813             .unwrap()
1814             .set_one_reg(
1815                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1816                 &kvm_regs_state.regs.s0.to_le_bytes(),
1817             )
1818             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1819 
1820         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s1);
1821         self.fd
1822             .lock()
1823             .unwrap()
1824             .set_one_reg(
1825                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1826                 &kvm_regs_state.regs.s1.to_le_bytes(),
1827             )
1828             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1829 
1830         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a0);
1831         self.fd
1832             .lock()
1833             .unwrap()
1834             .set_one_reg(
1835                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1836                 &kvm_regs_state.regs.a0.to_le_bytes(),
1837             )
1838             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1839 
1840         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a1);
1841         self.fd
1842             .lock()
1843             .unwrap()
1844             .set_one_reg(
1845                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1846                 &kvm_regs_state.regs.a1.to_le_bytes(),
1847             )
1848             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1849 
1850         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a2);
1851         self.fd
1852             .lock()
1853             .unwrap()
1854             .set_one_reg(
1855                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1856                 &kvm_regs_state.regs.a2.to_le_bytes(),
1857             )
1858             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1859 
1860         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a3);
1861         self.fd
1862             .lock()
1863             .unwrap()
1864             .set_one_reg(
1865                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1866                 &kvm_regs_state.regs.a3.to_le_bytes(),
1867             )
1868             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1869 
1870         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a4);
1871         self.fd
1872             .lock()
1873             .unwrap()
1874             .set_one_reg(
1875                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1876                 &kvm_regs_state.regs.a4.to_le_bytes(),
1877             )
1878             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1879 
1880         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a5);
1881         self.fd
1882             .lock()
1883             .unwrap()
1884             .set_one_reg(
1885                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1886                 &kvm_regs_state.regs.a5.to_le_bytes(),
1887             )
1888             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1889 
1890         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a6);
1891         self.fd
1892             .lock()
1893             .unwrap()
1894             .set_one_reg(
1895                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1896                 &kvm_regs_state.regs.a6.to_le_bytes(),
1897             )
1898             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1899 
1900         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, a7);
1901         self.fd
1902             .lock()
1903             .unwrap()
1904             .set_one_reg(
1905                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1906                 &kvm_regs_state.regs.a7.to_le_bytes(),
1907             )
1908             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1909 
1910         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s2);
1911         self.fd
1912             .lock()
1913             .unwrap()
1914             .set_one_reg(
1915                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1916                 &kvm_regs_state.regs.s2.to_le_bytes(),
1917             )
1918             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1919 
1920         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s3);
1921         self.fd
1922             .lock()
1923             .unwrap()
1924             .set_one_reg(
1925                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1926                 &kvm_regs_state.regs.s3.to_le_bytes(),
1927             )
1928             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1929 
1930         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s4);
1931         self.fd
1932             .lock()
1933             .unwrap()
1934             .set_one_reg(
1935                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1936                 &kvm_regs_state.regs.s4.to_le_bytes(),
1937             )
1938             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1939 
1940         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s5);
1941         self.fd
1942             .lock()
1943             .unwrap()
1944             .set_one_reg(
1945                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1946                 &kvm_regs_state.regs.s5.to_le_bytes(),
1947             )
1948             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1949 
1950         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s6);
1951         self.fd
1952             .lock()
1953             .unwrap()
1954             .set_one_reg(
1955                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1956                 &kvm_regs_state.regs.s6.to_le_bytes(),
1957             )
1958             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1959 
1960         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s7);
1961         self.fd
1962             .lock()
1963             .unwrap()
1964             .set_one_reg(
1965                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1966                 &kvm_regs_state.regs.s7.to_le_bytes(),
1967             )
1968             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1969 
1970         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s8);
1971         self.fd
1972             .lock()
1973             .unwrap()
1974             .set_one_reg(
1975                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1976                 &kvm_regs_state.regs.s8.to_le_bytes(),
1977             )
1978             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1979 
1980         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s9);
1981         self.fd
1982             .lock()
1983             .unwrap()
1984             .set_one_reg(
1985                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1986                 &kvm_regs_state.regs.s9.to_le_bytes(),
1987             )
1988             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1989 
1990         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s10);
1991         self.fd
1992             .lock()
1993             .unwrap()
1994             .set_one_reg(
1995                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1996                 &kvm_regs_state.regs.s10.to_le_bytes(),
1997             )
1998             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1999 
2000         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, s11);
2001         self.fd
2002             .lock()
2003             .unwrap()
2004             .set_one_reg(
2005                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2006                 &kvm_regs_state.regs.s11.to_le_bytes(),
2007             )
2008             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2009 
2010         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t3);
2011         self.fd
2012             .lock()
2013             .unwrap()
2014             .set_one_reg(
2015                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2016                 &kvm_regs_state.regs.t3.to_le_bytes(),
2017             )
2018             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2019 
2020         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t4);
2021         self.fd
2022             .lock()
2023             .unwrap()
2024             .set_one_reg(
2025                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2026                 &kvm_regs_state.regs.t4.to_le_bytes(),
2027             )
2028             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2029 
2030         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t5);
2031         self.fd
2032             .lock()
2033             .unwrap()
2034             .set_one_reg(
2035                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2036                 &kvm_regs_state.regs.t5.to_le_bytes(),
2037             )
2038             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2039 
2040         let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, t6);
2041         self.fd
2042             .lock()
2043             .unwrap()
2044             .set_one_reg(
2045                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2046                 &kvm_regs_state.regs.t6.to_le_bytes(),
2047             )
2048             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2049 
2050         let off = offset_of!(kvm_riscv_core, mode);
2051         self.fd
2052             .lock()
2053             .unwrap()
2054             .set_one_reg(
2055                 riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
2056                 &kvm_regs_state.mode.to_le_bytes(),
2057             )
2058             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2059 
2060         Ok(())
2061     }
2062 
2063     #[cfg(target_arch = "x86_64")]
2064     ///
2065     /// Returns the vCPU special registers.
2066     ///
2067     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
2068         Ok(self
2069             .fd
2070             .lock()
2071             .unwrap()
2072             .get_sregs()
2073             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
2074             .into())
2075     }
2076 
2077     #[cfg(target_arch = "x86_64")]
2078     ///
2079     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
2080     ///
2081     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
2082         let sregs = (*sregs).into();
2083         self.fd
2084             .lock()
2085             .unwrap()
2086             .set_sregs(&sregs)
2087             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
2088     }
2089 
2090     #[cfg(target_arch = "x86_64")]
2091     ///
2092     /// Returns the floating point state (FPU) from the vCPU.
2093     ///
2094     fn get_fpu(&self) -> cpu::Result<FpuState> {
2095         Ok(self
2096             .fd
2097             .lock()
2098             .unwrap()
2099             .get_fpu()
2100             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
2101             .into())
2102     }
2103 
2104     #[cfg(target_arch = "x86_64")]
2105     ///
2106     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
2107     ///
2108     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
2109         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
2110         self.fd
2111             .lock()
2112             .unwrap()
2113             .set_fpu(&fpu)
2114             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
2115     }
2116 
2117     #[cfg(target_arch = "x86_64")]
2118     ///
2119     /// X86 specific call to setup the CPUID registers.
2120     ///
2121     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
2122         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
2123             cpuid.iter().map(|e| (*e).into()).collect();
2124         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
2125             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
2126 
2127         self.fd
2128             .lock()
2129             .unwrap()
2130             .set_cpuid2(&kvm_cpuid)
2131             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
2132     }
2133 
2134     #[cfg(target_arch = "x86_64")]
2135     ///
2136     /// X86 specific call to enable HyperV SynIC
2137     ///
2138     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
2139         // Update the information about Hyper-V SynIC being enabled and
2140         // emulated as it will influence later which MSRs should be saved.
2141         self.hyperv_synic.store(true, Ordering::Release);
2142 
2143         let cap = kvm_enable_cap {
2144             cap: KVM_CAP_HYPERV_SYNIC,
2145             ..Default::default()
2146         };
2147         self.fd
2148             .lock()
2149             .unwrap()
2150             .enable_cap(&cap)
2151             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
2152     }
2153 
2154     ///
2155     /// X86 specific call to retrieve the CPUID registers.
2156     ///
2157     #[cfg(target_arch = "x86_64")]
2158     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
2159         let kvm_cpuid = self
2160             .fd
2161             .lock()
2162             .unwrap()
2163             .get_cpuid2(num_entries)
2164             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
2165 
2166         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
2167 
2168         Ok(v)
2169     }
2170 
2171     #[cfg(target_arch = "x86_64")]
2172     ///
2173     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
2174     ///
2175     fn get_lapic(&self) -> cpu::Result<LapicState> {
2176         Ok(self
2177             .fd
2178             .lock()
2179             .unwrap()
2180             .get_lapic()
2181             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
2182             .into())
2183     }
2184 
2185     #[cfg(target_arch = "x86_64")]
2186     ///
2187     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
2188     ///
2189     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
2190         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
2191         self.fd
2192             .lock()
2193             .unwrap()
2194             .set_lapic(&klapic)
2195             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
2196     }
2197 
2198     #[cfg(target_arch = "x86_64")]
2199     ///
2200     /// Returns the model-specific registers (MSR) for this vCPU.
2201     ///
2202     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
2203         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
2204         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
2205         let succ = self
2206             .fd
2207             .lock()
2208             .unwrap()
2209             .get_msrs(&mut kvm_msrs)
2210             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
2211 
2212         msrs[..succ].copy_from_slice(
2213             &kvm_msrs.as_slice()[..succ]
2214                 .iter()
2215                 .map(|e| (*e).into())
2216                 .collect::<Vec<MsrEntry>>(),
2217         );
2218 
2219         Ok(succ)
2220     }
2221 
2222     #[cfg(target_arch = "x86_64")]
2223     ///
2224     /// Setup the model-specific registers (MSR) for this vCPU.
2225     /// Returns the number of MSR entries actually written.
2226     ///
2227     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
2228         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
2229         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
2230         self.fd
2231             .lock()
2232             .unwrap()
2233             .set_msrs(&kvm_msrs)
2234             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
2235     }
2236 
2237     ///
2238     /// Returns the vcpu's current "multiprocessing state".
2239     ///
2240     fn get_mp_state(&self) -> cpu::Result<MpState> {
2241         Ok(self
2242             .fd
2243             .lock()
2244             .unwrap()
2245             .get_mp_state()
2246             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
2247             .into())
2248     }
2249 
2250     ///
2251     /// Sets the vcpu's current "multiprocessing state".
2252     ///
2253     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
2254         self.fd
2255             .lock()
2256             .unwrap()
2257             .set_mp_state(mp_state.into())
2258             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
2259     }
2260 
2261     #[cfg(target_arch = "x86_64")]
2262     ///
2263     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
2264     ///
2265     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
2266         let tr = self
2267             .fd
2268             .lock()
2269             .unwrap()
2270             .translate_gva(gva)
2271             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
2272         // tr.valid is set if the GVA is mapped to valid GPA.
2273         match tr.valid {
2274             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
2275                 "Invalid GVA: {:#x}",
2276                 gva
2277             ))),
2278             _ => Ok((tr.physical_address, 0)),
2279         }
2280     }
2281 
2282     ///
2283     /// Triggers the running of the current virtual CPU returning an exit reason.
2284     ///
2285     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
2286         match self.fd.lock().unwrap().run() {
2287             Ok(run) => match run {
2288                 #[cfg(target_arch = "x86_64")]
2289                 VcpuExit::IoIn(addr, data) => {
2290                     if let Some(vm_ops) = &self.vm_ops {
2291                         return vm_ops
2292                             .pio_read(addr.into(), data)
2293                             .map(|_| cpu::VmExit::Ignore)
2294                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2295                     }
2296 
2297                     Ok(cpu::VmExit::Ignore)
2298                 }
2299                 #[cfg(target_arch = "x86_64")]
2300                 VcpuExit::IoOut(addr, data) => {
2301                     if let Some(vm_ops) = &self.vm_ops {
2302                         return vm_ops
2303                             .pio_write(addr.into(), data)
2304                             .map(|_| cpu::VmExit::Ignore)
2305                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2306                     }
2307 
2308                     Ok(cpu::VmExit::Ignore)
2309                 }
2310                 #[cfg(target_arch = "x86_64")]
2311                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
2312                 #[cfg(target_arch = "x86_64")]
2313                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
2314 
2315                 #[cfg(target_arch = "aarch64")]
2316                 VcpuExit::SystemEvent(event_type, flags) => {
2317                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
2318                     // On Aarch64, when the VM is shutdown, run() returns
2319                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
2320                     if event_type == KVM_SYSTEM_EVENT_RESET {
2321                         Ok(cpu::VmExit::Reset)
2322                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
2323                         Ok(cpu::VmExit::Shutdown)
2324                     } else {
2325                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2326                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
2327                             event_type,
2328                             flags
2329                         )))
2330                     }
2331                 }
2332 
2333                 VcpuExit::MmioRead(addr, data) => {
2334                     if let Some(vm_ops) = &self.vm_ops {
2335                         return vm_ops
2336                             .mmio_read(addr, data)
2337                             .map(|_| cpu::VmExit::Ignore)
2338                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2339                     }
2340 
2341                     Ok(cpu::VmExit::Ignore)
2342                 }
2343                 VcpuExit::MmioWrite(addr, data) => {
2344                     if let Some(vm_ops) = &self.vm_ops {
2345                         return vm_ops
2346                             .mmio_write(addr, data)
2347                             .map(|_| cpu::VmExit::Ignore)
2348                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2349                     }
2350 
2351                     Ok(cpu::VmExit::Ignore)
2352                 }
2353                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
2354                 #[cfg(feature = "tdx")]
2355                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
2356                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
2357 
2358                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2359                     "Unexpected exit reason on vcpu run: {:?}",
2360                     r
2361                 ))),
2362             },
2363 
2364             Err(ref e) => match e.errno() {
2365                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
2366                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2367                     "VCPU error {:?}",
2368                     e
2369                 ))),
2370             },
2371         }
2372     }
2373 
2374     #[cfg(target_arch = "x86_64")]
2375     ///
2376     /// Let the guest know that it has been paused, which prevents from
2377     /// potential soft lockups when being resumed.
2378     ///
2379     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
2380         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
2381             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
2382             // which could be because we're still in firmware or the guest doesn't
2383             // use KVM clock.
2384             if e.errno() != libc::EINVAL {
2385                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
2386             }
2387         }
2388 
2389         Ok(())
2390     }
2391 
    #[cfg(not(target_arch = "riscv64"))]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        // KVM_GUESTDBG_ENABLE turns guest debugging on; the USE_HW* flag
        // selects hardware breakpoints (x86: USE_HW_BP, aarch64: USE_HW).
        let mut dbg = kvm_guest_debug {
            #[cfg(target_arch = "x86_64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            #[cfg(target_arch = "aarch64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set the debug registers.
        // Here we assume that the number of addresses do not exceed what
        // `Hypervisor::get_guest_debug_hw_bps()` specifies.
        #[cfg(target_arch = "x86_64")]
        {
            // debugreg[0..3] are DR0..DR3 (breakpoint addresses),
            // debugreg[7] is DR7 (control register).
            // Set bits 9 and 10.
            // bit 9: GE (global exact breakpoint enable) flag.
            // bit 10: always 1.
            dbg.arch.debugreg[7] = 0x0600;

            for (i, addr) in addrs.iter().enumerate() {
                dbg.arch.debugreg[i] = addr.0;
                // Set global breakpoint enable flag
                // (DR7 odd bits 1/3/5/7 = G0..G3 for breakpoints 0..3).
                dbg.arch.debugreg[7] |= 2 << (i * 2);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            for (i, addr) in addrs.iter().enumerate() {
                // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
                // bit 0: 1 (Enabled)
                // bit 1~2: 0b11 (PMC = EL1/EL0)
                // bit 5~8: 0b1111 (BAS = AArch64)
                // others: 0
                dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
                // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
                // bit 2~52: VA[2:52]
                dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
            }
        }
        self.fd
            .lock()
            .unwrap()
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
2448 
    #[cfg(target_arch = "aarch64")]
    /// Returns the vCPU feature that must be finalized (via
    /// `KVM_ARM_VCPU_FINALIZE`) before the vCPU becomes usable: SVE.
    fn vcpu_get_finalized_features(&self) -> i32 {
        kvm_bindings::KVM_ARM_VCPU_SVE as i32
    }
2453 
2454     #[cfg(target_arch = "aarch64")]
2455     fn vcpu_set_processor_features(
2456         &self,
2457         vm: &Arc<dyn crate::Vm>,
2458         kvi: &mut crate::VcpuInit,
2459         id: u8,
2460     ) -> cpu::Result<()> {
2461         use std::arch::is_aarch64_feature_detected;
2462         #[allow(clippy::nonminimal_bool)]
2463         let sve_supported =
2464             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
2465 
2466         let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2467 
2468         // We already checked that the capability is supported.
2469         kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
2470         if vm
2471             .as_any()
2472             .downcast_ref::<crate::kvm::KvmVm>()
2473             .unwrap()
2474             .check_extension(Cap::ArmPmuV3)
2475         {
2476             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
2477         }
2478 
2479         if sve_supported
2480             && vm
2481                 .as_any()
2482                 .downcast_ref::<crate::kvm::KvmVm>()
2483                 .unwrap()
2484                 .check_extension(Cap::ArmSve)
2485         {
2486             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
2487         }
2488 
2489         // Non-boot cpus are powered off initially.
2490         if id > 0 {
2491             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
2492         }
2493 
2494         *kvi = kvm_kvi.into();
2495 
2496         Ok(())
2497     }
2498 
2499     ///
2500     /// Return VcpuInit with default value set
2501     ///
2502     #[cfg(target_arch = "aarch64")]
2503     fn create_vcpu_init(&self) -> crate::VcpuInit {
2504         kvm_bindings::kvm_vcpu_init::default().into()
2505     }
2506 
2507     #[cfg(target_arch = "aarch64")]
2508     fn vcpu_init(&self, kvi: &crate::VcpuInit) -> cpu::Result<()> {
2509         let kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2510         self.fd
2511             .lock()
2512             .unwrap()
2513             .vcpu_init(&kvm_kvi)
2514             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
2515     }
2516 
2517     #[cfg(target_arch = "aarch64")]
2518     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
2519         self.fd
2520             .lock()
2521             .unwrap()
2522             .vcpu_finalize(&feature)
2523             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
2524     }
2525 
2526     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
2527     ///
2528     /// Gets a list of the guest registers that are supported for the
2529     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
2530     ///
2531     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
2532         let mut kvm_reg_list: kvm_bindings::RegList = reg_list.clone().into();
2533         self.fd
2534             .lock()
2535             .unwrap()
2536             .get_reg_list(&mut kvm_reg_list)
2537             .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2538         *reg_list = kvm_reg_list.into();
2539         Ok(())
2540     }
2541 
2542     ///
2543     /// Gets the value of a system register
2544     ///
2545     #[cfg(target_arch = "aarch64")]
2546     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
2547         //
2548         // Arm Architecture Reference Manual defines the encoding of
2549         // AArch64 system registers, see
2550         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
2551         // While KVM defines another ID for each AArch64 system register,
2552         // which is used in calling `KVM_G/SET_ONE_REG` to access a system
2553         // register of a guest.
2554         // A mapping exists between the Arm standard encoding and the KVM ID.
2555         // This function takes the standard u32 ID as input parameter, converts
2556         // it to the corresponding KVM ID, and call `KVM_GET_ONE_REG` API to
2557         // get the value of the system parameter.
2558         //
2559         let id: u64 = KVM_REG_ARM64
2560             | KVM_REG_SIZE_U64
2561             | KVM_REG_ARM64_SYSREG as u64
2562             | ((((sys_reg) >> 5)
2563                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
2564                     | KVM_REG_ARM64_SYSREG_OP1_MASK
2565                     | KVM_REG_ARM64_SYSREG_CRN_MASK
2566                     | KVM_REG_ARM64_SYSREG_CRM_MASK
2567                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
2568         let mut bytes = [0_u8; 8];
2569         self.fd
2570             .lock()
2571             .unwrap()
2572             .get_one_reg(id, &mut bytes)
2573             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2574         Ok(u64::from_le_bytes(bytes))
2575     }
2576 
    ///
    /// Gets the value of a non-core register
    ///
    #[cfg(target_arch = "riscv64")]
    fn get_non_core_reg(&self, _non_core_reg: u32) -> cpu::Result<u64> {
        // Stub: not implemented for riscv64 yet; panics if called. The
        // parameter is the encoded register id, mirroring the aarch64
        // `get_sys_reg` counterpart above.
        unimplemented!()
    }
2584 
2585     ///
2586     /// Configure core registers for a given CPU.
2587     ///
2588     #[cfg(target_arch = "aarch64")]
2589     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2590         #[allow(non_upper_case_globals)]
2591         // PSR (Processor State Register) bits.
2592         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
2593         const PSR_MODE_EL1h: u64 = 0x0000_0005;
2594         const PSR_F_BIT: u64 = 0x0000_0040;
2595         const PSR_I_BIT: u64 = 0x0000_0080;
2596         const PSR_A_BIT: u64 = 0x0000_0100;
2597         const PSR_D_BIT: u64 = 0x0000_0200;
2598         // Taken from arch/arm64/kvm/inject_fault.c.
2599         const PSTATE_FAULT_BITS_64: u64 =
2600             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
2601 
2602         let kreg_off = offset_of!(kvm_regs, regs);
2603 
2604         // Get the register index of the PSTATE (Processor State) register.
2605         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
2606         self.fd
2607             .lock()
2608             .unwrap()
2609             .set_one_reg(
2610                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
2611                 &PSTATE_FAULT_BITS_64.to_le_bytes(),
2612             )
2613             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2614 
2615         // Other vCPUs are powered off initially awaiting PSCI wakeup.
2616         if cpu_id == 0 {
2617             // Setting the PC (Processor Counter) to the current program address (kernel address).
2618             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
2619             self.fd
2620                 .lock()
2621                 .unwrap()
2622                 .set_one_reg(
2623                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
2624                     &boot_ip.to_le_bytes(),
2625                 )
2626                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2627 
2628             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2629             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2630             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
2631             // We are choosing to place it the end of DRAM. See `get_fdt_addr`.
2632             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
2633             self.fd
2634                 .lock()
2635                 .unwrap()
2636                 .set_one_reg(
2637                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
2638                     &fdt_start.to_le_bytes(),
2639                 )
2640                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2641         }
2642         Ok(())
2643     }
2644 
2645     #[cfg(target_arch = "riscv64")]
2646     ///
2647     /// Configure registers for a given RISC-V CPU.
2648     ///
2649     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2650         // Setting the A0 () to the hartid of this CPU.
2651         let a0 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a0);
2652         self.fd
2653             .lock()
2654             .unwrap()
2655             .set_one_reg(
2656                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a0),
2657                 &u64::from(cpu_id).to_le_bytes(),
2658             )
2659             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2660 
2661         // Setting the PC (Processor Counter) to the current program address (kernel address).
2662         let pc = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc);
2663         self.fd
2664             .lock()
2665             .unwrap()
2666             .set_one_reg(
2667                 riscv64_reg_id!(KVM_REG_RISCV_CORE, pc),
2668                 &boot_ip.to_le_bytes(),
2669             )
2670             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2671 
2672         // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2673         // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2674         // not exceed 64 kilobytes in size." -> https://www.kernel.org/doc/Documentation/arch/riscv/boot.txt.
2675         let a1 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a1);
2676         self.fd
2677             .lock()
2678             .unwrap()
2679             .set_one_reg(
2680                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a1),
2681                 &fdt_start.to_le_bytes(),
2682             )
2683             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2684 
2685         Ok(())
2686     }
2687 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a prepopulated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // get_msrs() returned fewer entries than requested, so `num_msrs`
            // is the index of the first MSR the kernel failed to read.
            let mut faulty_msr_index = num_msrs;
            // Keep everything that was read successfully so far.
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                // Accumulate whatever was read successfully from this chunk.
                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                // The whole remainder was read: no faulty MSRs left.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                // The next faulty MSR sits right after the successfully read
                // portion of this chunk.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        // Saved last, per the GET_VCPU_EVENTS ordering note above.
        let vcpu_events = self.get_vcpu_events()?;
        let tsc_khz = self.tsc_khz()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
            tsc_khz,
        }
        .into())
    }
2811 
2812     ///
2813     /// Get the current AArch64 CPU state
2814     ///
2815     #[cfg(target_arch = "aarch64")]
2816     fn state(&self) -> cpu::Result<CpuState> {
2817         let mut state = VcpuKvmState {
2818             mp_state: self.get_mp_state()?.into(),
2819             ..Default::default()
2820         };
2821         // Get core registers
2822         state.core_regs = self.get_regs()?.into();
2823 
2824         // Get systerm register
2825         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2826         // For ArmV8 there are around 500 registers.
2827         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2828         let mut reg_list = kvm_bindings::RegList::new(500).unwrap();
2829         self.fd
2830             .lock()
2831             .unwrap()
2832             .get_reg_list(&mut reg_list)
2833             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2834 
2835         // At this point reg_list should contain: core registers and system
2836         // registers.
2837         // The register list contains the number of registers and their ids. We
2838         // will be needing to call KVM_GET_ONE_REG on each id in order to save
2839         // all of them. We carve out from the list  the core registers which are
2840         // represented in the kernel by kvm_regs structure and for which we can
2841         // calculate the id based on the offset in the structure.
2842         reg_list.retain(|regid| is_system_register(*regid));
2843 
2844         // Now, for the rest of the registers left in the previously fetched
2845         // register list, we are simply calling KVM_GET_ONE_REG.
2846         let indices = reg_list.as_slice();
2847         for index in indices.iter() {
2848             let mut bytes = [0_u8; 8];
2849             self.fd
2850                 .lock()
2851                 .unwrap()
2852                 .get_one_reg(*index, &mut bytes)
2853                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2854             sys_regs.push(kvm_bindings::kvm_one_reg {
2855                 id: *index,
2856                 addr: u64::from_le_bytes(bytes),
2857             });
2858         }
2859 
2860         state.sys_regs = sys_regs;
2861 
2862         Ok(state.into())
2863     }
2864 
2865     #[cfg(target_arch = "riscv64")]
2866     ///
2867     /// Get the current RISC-V 64-bit CPU state
2868     ///
2869     fn state(&self) -> cpu::Result<CpuState> {
2870         let mut state = VcpuKvmState {
2871             mp_state: self.get_mp_state()?.into(),
2872             ..Default::default()
2873         };
2874         // Get core registers
2875         state.core_regs = self.get_regs()?.into();
2876 
2877         // Get non-core register
2878         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2879         // For RISC-V 64-bit there are around 200 registers.
2880         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2881         let mut reg_list = kvm_bindings::RegList::new(200).unwrap();
2882         self.fd
2883             .lock()
2884             .unwrap()
2885             .get_reg_list(&mut reg_list)
2886             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2887 
2888         // At this point reg_list should contain:
2889         // - core registers
2890         // - config registers
2891         // - timer registers
2892         // - control and status registers
2893         // - AIA control and status registers
2894         // - smstateen control and status registers
2895         // - sbi_sta control and status registers.
2896         //
2897         // The register list contains the number of registers and their ids. We
2898         // will be needing to call KVM_GET_ONE_REG on each id in order to save
2899         // all of them. We carve out from the list the core registers which are
2900         // represented in the kernel by `kvm_riscv_core` structure and for which
2901         // we can calculate the id based on the offset in the structure.
2902         reg_list.retain(|regid| is_non_core_register(*regid));
2903 
2904         // Now, for the rest of the registers left in the previously fetched
2905         // register list, we are simply calling KVM_GET_ONE_REG.
2906         let indices = reg_list.as_slice();
2907         for index in indices.iter() {
2908             let mut bytes = [0_u8; 8];
2909             self.fd
2910                 .lock()
2911                 .unwrap()
2912                 .get_one_reg(*index, &mut bytes)
2913                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2914             sys_regs.push(kvm_bindings::kvm_one_reg {
2915                 id: *index,
2916                 addr: u64::from_le_bytes(bytes),
2917             });
2918         }
2919 
2920         state.non_core_regs = sys_regs;
2921 
2922         Ok(state.into())
2923     }
2924 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        // LAPIC after SREGS and before MSRS, per the ordering notes above.
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Restore the TSC frequency if one was saved.
        if let Some(freq) = state.tsc_khz {
            self.set_tsc_khz(freq)?;
        }

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // set_msrs() stopped early, so `num_msrs` is the index of the
            // first MSR the kernel refused to write.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                // The whole remainder was written: no faulty MSRs left.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                // The next faulty MSR sits right after the successfully
                // written portion of this chunk.
                faulty_msr_index = start_pos + num_msrs;
            }
        }

        // Restored last: SET_REGS above clears pending exceptions, and this
        // call puts them back (see ordering notes above).
        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
3014 
3015     ///
3016     /// Restore the previously saved AArch64 CPU state
3017     ///
3018     #[cfg(target_arch = "aarch64")]
3019     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
3020         let state: VcpuKvmState = state.clone().into();
3021         // Set core registers
3022         self.set_regs(&state.core_regs.into())?;
3023         // Set system registers
3024         for reg in &state.sys_regs {
3025             self.fd
3026                 .lock()
3027                 .unwrap()
3028                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
3029                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
3030         }
3031 
3032         self.set_mp_state(state.mp_state.into())?;
3033 
3034         Ok(())
3035     }
3036 
3037     #[cfg(target_arch = "riscv64")]
3038     ///
3039     /// Restore the previously saved RISC-V 64-bit CPU state
3040     ///
3041     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
3042         let state: VcpuKvmState = state.clone().into();
3043         // Set core registers
3044         self.set_regs(&state.core_regs.into())?;
3045         // Set system registers
3046         for reg in &state.non_core_regs {
3047             self.fd
3048                 .lock()
3049                 .unwrap()
3050                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
3051                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
3052         }
3053 
3054         self.set_mp_state(state.mp_state.into())?;
3055 
3056         Ok(())
3057     }
3058 
3059     ///
3060     /// Initialize TDX for this CPU
3061     ///
3062     #[cfg(feature = "tdx")]
3063     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
3064         tdx_command(
3065             &self.fd.lock().unwrap().as_raw_fd(),
3066             TdxCommand::InitVcpu,
3067             0,
3068             hob_address,
3069         )
3070         .map_err(cpu::HypervisorCpuError::InitializeTdx)
3071     }
3072 
3073     ///
3074     /// Set the "immediate_exit" state
3075     ///
3076     fn set_immediate_exit(&self, exit: bool) {
3077         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
3078     }
3079 
    ///
    /// Returns the details about TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let mut fd = self.fd.as_ref().lock().unwrap();
        let kvm_run = fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure. The kvm_run
        // exit union is reinterpreted through the TDX-specific layout
        // (KvmTdxExit) to reach the vmcall fields.
        let tdx_vmcall = unsafe {
            &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
                as *mut KvmTdxExit))
                .u
                .vmcall
        };

        // Default to "invalid operand"; set_tdx_status() overwrites this
        // once the VMM has handled the call.
        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        // Only VMCALLs with type 0 are handled here; anything else is
        // reported as unknown.
        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        // Map the supported subfunction numbers to exit details; all other
        // subfunctions are unknown to this VMM.
        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }
3109 
    ///
    /// Set the status code for TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let mut fd = self.fd.as_ref().lock().unwrap();
        let kvm_run = fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure. The kvm_run
        // exit union is reinterpreted through the TDX-specific layout
        // (KvmTdxExit) to reach the vmcall fields.
        let tdx_vmcall = unsafe {
            &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
                as *mut KvmTdxExit))
                .u
                .vmcall
        };

        // Translate the VMM-level status into the TDG.VP.VMCALL return code
        // written back into the kvm_run vmcall area.
        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
3130 
3131     #[cfg(target_arch = "x86_64")]
3132     ///
3133     /// Return the list of initial MSR entries for a VCPU
3134     ///
3135     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
3136         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
3137 
3138         [
3139             msr!(msr_index::MSR_IA32_SYSENTER_CS),
3140             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
3141             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
3142             msr!(msr_index::MSR_STAR),
3143             msr!(msr_index::MSR_CSTAR),
3144             msr!(msr_index::MSR_LSTAR),
3145             msr!(msr_index::MSR_KERNEL_GS_BASE),
3146             msr!(msr_index::MSR_SYSCALL_MASK),
3147             msr!(msr_index::MSR_IA32_TSC),
3148             msr_data!(
3149                 msr_index::MSR_IA32_MISC_ENABLE,
3150                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
3151             ),
3152             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
3153         ]
3154         .to_vec()
3155     }
3156 
3157     #[cfg(target_arch = "aarch64")]
3158     fn has_pmu_support(&self) -> bool {
3159         let cpu_attr = kvm_bindings::kvm_device_attr {
3160             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
3161             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
3162             addr: 0x0,
3163             flags: 0,
3164         };
3165         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
3166     }
3167 
3168     #[cfg(target_arch = "aarch64")]
3169     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
3170         let cpu_attr = kvm_bindings::kvm_device_attr {
3171             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
3172             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
3173             addr: 0x0,
3174             flags: 0,
3175         };
3176         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
3177             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
3178             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
3179             addr: &irq as *const u32 as u64,
3180             flags: 0,
3181         };
3182         self.fd
3183             .lock()
3184             .unwrap()
3185             .set_device_attr(&cpu_attr_irq)
3186             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
3187         self.fd
3188             .lock()
3189             .unwrap()
3190             .set_device_attr(&cpu_attr)
3191             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
3192     }
3193 
3194     #[cfg(target_arch = "x86_64")]
3195     ///
3196     /// Get the frequency of the TSC if available
3197     ///
3198     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
3199         match self.fd.lock().unwrap().get_tsc_khz() {
3200             Err(e) => {
3201                 if e.errno() == libc::EIO {
3202                     Ok(None)
3203                 } else {
3204                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
3205                 }
3206             }
3207             Ok(v) => Ok(Some(v)),
3208         }
3209     }
3210 
3211     #[cfg(target_arch = "x86_64")]
3212     ///
3213     /// Set the frequency of the TSC if available
3214     ///
3215     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
3216         match self.fd.lock().unwrap().set_tsc_khz(freq) {
3217             Err(e) => {
3218                 if e.errno() == libc::EIO {
3219                     Ok(())
3220                 } else {
3221                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
3222                 }
3223             }
3224             Ok(_) => Ok(()),
3225         }
3226     }
3227 
3228     #[cfg(target_arch = "x86_64")]
3229     ///
3230     /// Trigger NMI interrupt
3231     ///
3232     fn nmi(&self) -> cpu::Result<()> {
3233         match self.fd.lock().unwrap().nmi() {
3234             Err(e) => {
3235                 if e.errno() == libc::EIO {
3236                     Ok(())
3237                 } else {
3238                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
3239                 }
3240             }
3241             Ok(_) => Ok(()),
3242         }
3243     }
3244 }
3245 
3246 impl KvmVcpu {
3247     #[cfg(target_arch = "x86_64")]
3248     ///
3249     /// X86 specific call that returns the vcpu's current "xsave struct".
3250     ///
3251     fn get_xsave(&self) -> cpu::Result<XsaveState> {
3252         Ok(self
3253             .fd
3254             .lock()
3255             .unwrap()
3256             .get_xsave()
3257             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
3258             .into())
3259     }
3260 
3261     #[cfg(target_arch = "x86_64")]
3262     ///
3263     /// X86 specific call that sets the vcpu's current "xsave struct".
3264     ///
3265     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
3266         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
3267         self.fd
3268             .lock()
3269             .unwrap()
3270             .set_xsave(&xsave)
3271             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
3272     }
3273 
3274     #[cfg(target_arch = "x86_64")]
3275     ///
3276     /// X86 specific call that returns the vcpu's current "xcrs".
3277     ///
3278     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
3279         self.fd
3280             .lock()
3281             .unwrap()
3282             .get_xcrs()
3283             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
3284     }
3285 
3286     #[cfg(target_arch = "x86_64")]
3287     ///
3288     /// X86 specific call that sets the vcpu's current "xcrs".
3289     ///
3290     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
3291         self.fd
3292             .lock()
3293             .unwrap()
3294             .set_xcrs(xcrs)
3295             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
3296     }
3297 
3298     #[cfg(target_arch = "x86_64")]
3299     ///
3300     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
3301     /// states of the vcpu.
3302     ///
3303     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
3304         self.fd
3305             .lock()
3306             .unwrap()
3307             .get_vcpu_events()
3308             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
3309     }
3310 
3311     #[cfg(target_arch = "x86_64")]
3312     ///
3313     /// Sets pending exceptions, interrupts, and NMIs as well as related states
3314     /// of the vcpu.
3315     ///
3316     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
3317         self.fd
3318             .lock()
3319             .unwrap()
3320             .set_vcpu_events(events)
3321             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
3322     }
3323 }
3324 
#[cfg(test)]
mod tests {
    #[test]
    #[cfg(target_arch = "riscv64")]
    fn test_get_and_set_regs() {
        use super::*;

        // Requires a riscv64 host with /dev/kvm available.
        let hypervisor = Arc::new(KvmHypervisor::new().unwrap());
        let vm = hypervisor.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        // General-purpose register file: each register holds its own index so
        // a set/get roundtrip mismatch is easy to spot.
        let gprs = user_regs_struct {
            pc: 0x00,
            ra: 0x01,
            sp: 0x02,
            gp: 0x03,
            tp: 0x04,
            t0: 0x05,
            t1: 0x06,
            t2: 0x07,
            s0: 0x08,
            s1: 0x09,
            a0: 0x0a,
            a1: 0x0b,
            a2: 0x0c,
            a3: 0x0d,
            a4: 0x0e,
            a5: 0x0f,
            a6: 0x10,
            a7: 0x11,
            s2: 0x12,
            s3: 0x13,
            s4: 0x14,
            s5: 0x15,
            s6: 0x16,
            s7: 0x17,
            s8: 0x18,
            s9: 0x19,
            s10: 0x1a,
            s11: 0x1b,
            t3: 0x1c,
            t4: 0x1d,
            t5: 0x1e,
            t6: 0x1f,
        };
        let core_regs = StandardRegisters::from(kvm_riscv_core {
            regs: gprs,
            mode: 0x00,
        });

        // Writing the registers and reading them back must be lossless.
        vcpu.set_regs(&core_regs).unwrap();
        assert_eq!(vcpu.get_regs().unwrap(), core_regs);
    }
}
3379