xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision ea4693a09123234951ae1516f112c5cfce5032ca)
1 // Copyright © 2024 Institute of Software, CAS. All rights reserved.
2 //
3 // Copyright © 2019 Intel Corporation
4 //
5 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
6 //
7 // Copyright © 2020, Microsoft Corporation
8 //
9 // Copyright 2018-2019 CrowdStrike, Inc.
10 //
11 //
12 
13 use std::any::Any;
14 use std::collections::HashMap;
15 #[cfg(target_arch = "x86_64")]
16 use std::fs::File;
17 #[cfg(target_arch = "x86_64")]
18 use std::os::unix::io::AsRawFd;
19 #[cfg(feature = "tdx")]
20 use std::os::unix::io::RawFd;
21 use std::result;
22 #[cfg(target_arch = "x86_64")]
23 use std::sync::atomic::{AtomicBool, Ordering};
24 use std::sync::{Arc, Mutex, RwLock};
25 
26 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
27 use vmm_sys_util::eventfd::EventFd;
28 
29 #[cfg(target_arch = "aarch64")]
30 use crate::aarch64::gic::KvmGicV3Its;
31 #[cfg(target_arch = "aarch64")]
32 pub use crate::aarch64::{check_required_kvm_extensions, is_system_register, VcpuKvmState};
33 #[cfg(target_arch = "aarch64")]
34 use crate::arch::aarch64::gic::{Vgic, VgicConfig};
35 #[cfg(target_arch = "riscv64")]
36 use crate::arch::riscv64::aia::{Vaia, VaiaConfig};
37 #[cfg(target_arch = "riscv64")]
38 use crate::riscv64::aia::KvmAiaImsics;
39 #[cfg(target_arch = "riscv64")]
40 pub use crate::riscv64::{
41     aia::AiaImsicsState as AiaState, check_required_kvm_extensions, is_non_core_register,
42     VcpuKvmState,
43 };
44 use crate::vm::{self, InterruptSourceConfig, VmOps};
45 #[cfg(target_arch = "aarch64")]
46 use crate::{arm64_core_reg_id, offset_of};
47 use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType};
48 #[cfg(target_arch = "riscv64")]
49 use crate::{offset_of, riscv64_reg_id};
50 // x86_64 dependencies
51 #[cfg(target_arch = "x86_64")]
52 pub mod x86_64;
53 #[cfg(target_arch = "x86_64")]
54 use kvm_bindings::{
55     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
56     KVM_GUESTDBG_USE_HW_BP,
57 };
58 #[cfg(target_arch = "x86_64")]
59 use x86_64::check_required_kvm_extensions;
60 #[cfg(target_arch = "x86_64")]
61 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};
62 
63 #[cfg(target_arch = "x86_64")]
64 use crate::arch::x86::{
65     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
66 };
67 #[cfg(target_arch = "x86_64")]
68 use crate::ClockData;
69 use crate::{
70     CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion,
71     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
72 };
73 // aarch64 dependencies
74 #[cfg(target_arch = "aarch64")]
75 pub mod aarch64;
76 // riscv64 dependencies
77 #[cfg(target_arch = "riscv64")]
78 pub mod riscv64;
79 #[cfg(target_arch = "aarch64")]
80 use std::mem;
81 
82 ///
83 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
84 ///
85 #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
86 pub use kvm_bindings::kvm_vcpu_events as VcpuEvents;
87 pub use kvm_bindings::{
88     kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice,
89     kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
90     kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region,
91     KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
92     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
93 };
94 #[cfg(target_arch = "aarch64")]
95 use kvm_bindings::{
96     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
97     KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
98     KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
99     KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
100 };
101 #[cfg(target_arch = "riscv64")]
102 use kvm_bindings::{kvm_riscv_core, user_regs_struct, KVM_REG_RISCV_CORE};
103 #[cfg(feature = "tdx")]
104 use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
105 pub use kvm_ioctls::{Cap, Kvm, VcpuExit};
106 use thiserror::Error;
107 use vfio_ioctls::VfioDeviceFd;
108 #[cfg(feature = "tdx")]
109 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
110 pub use {kvm_bindings, kvm_ioctls};
111 
112 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
113 use crate::RegList;
114 
115 #[cfg(target_arch = "x86_64")]
116 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
117 
118 #[cfg(target_arch = "x86_64")]
119 use vmm_sys_util::ioctl_io_nr;
120 #[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
121 use vmm_sys_util::ioctl_ioc_nr;
122 
123 #[cfg(target_arch = "x86_64")]
124 ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);
125 
126 #[cfg(feature = "tdx")]
127 const KVM_EXIT_TDX: u32 = 50;
128 #[cfg(feature = "tdx")]
129 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
130 #[cfg(feature = "tdx")]
131 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
132 #[cfg(feature = "tdx")]
133 const TDG_VP_VMCALL_SUCCESS: u64 = 0;
134 #[cfg(feature = "tdx")]
135 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;
136 
137 #[cfg(feature = "tdx")]
138 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
139 
/// Sub-commands of the `KVM_MEMORY_ENCRYPT_OP` ioctl driving the TDX VM
/// life cycle. The explicit `u32` discriminants must match the kernel's
/// TDX command numbering — do not reorder.
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    /// Query the TDX capabilities supported by the host.
    Capabilities = 0,
    /// Initialize the TD VM (attributes, CPUID configuration).
    InitVm,
    /// Initialize a TD vCPU.
    InitVcpu,
    /// Add and measure a guest memory region into the TD.
    InitMemRegion,
    /// Finalize the TD measurement; the TD is immutable afterwards.
    Finalize,
}
149 
/// The TDG.VP.VMCALL sub-functions this VMM knows how to service
/// (see `TDG_VP_VMCALL_GET_QUOTE` / `TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT`).
#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    /// Guest requested an attestation quote.
    GetQuote,
    /// Guest configured the event-notification interrupt vector.
    SetupEventNotifyInterrupt,
}
155 
/// Completion status the VMM reports back to the TD for a VMCALL exit;
/// mapped to `TDG_VP_VMCALL_SUCCESS` / `TDG_VP_VMCALL_INVALID_OPERAND`.
#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    /// The VMCALL was handled successfully.
    Success,
    /// The VMCALL carried an operand the VMM cannot service.
    InvalidOperand,
}
161 
162 #[cfg(feature = "tdx")]
163 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
164 
/// One CPUID leaf/sub-leaf configuration entry reported in `TdxCapabilities`.
/// `repr(C)` because this is exchanged with the kernel via ioctl.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    // CPUID leaf (EAX input value).
    pub leaf: u32,
    // CPUID sub-leaf (ECX input value).
    pub sub_leaf: u32,
    // Configurable output register bits for this leaf.
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}
176 
/// Host TDX capabilities as returned by the `Capabilities` sub-command of
/// `KVM_MEMORY_ENCRYPT_OP`. `repr(C)` because the kernel fills this in.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    // TD attribute bits that must be 0 / must be 1.
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    // XFAM (extended features available mask) bits that must be 0 / must be 1.
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    // Number of valid entries in `cpuid_configs` (at most TDX_MAX_NR_CPUID_CONFIGS).
    pub nr_cpuid_configs: u32,
    // Explicit padding to keep the array 8-byte aligned.
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}
189 
/// TDX exit information, presumably mirroring the kernel's TDX exit layout in
/// the `kvm_run` area (see `KVM_EXIT_TDX`) — TODO confirm against the kernel
/// headers in use.
///
/// Fix: added `#[repr(C)]`. The nested `KvmTdxExitU`/`KvmTdxExitVmcall` are
/// already `repr(C)`, but without it on the outer struct Rust is free to
/// reorder `type_`/`pad`/`u`, which would break interpretation of
/// kernel-written data.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub struct KvmTdxExit {
    /// Discriminator selecting which member of `u` is valid.
    pub type_: u32,
    /// Explicit padding keeping `u` 8-byte aligned.
    pub pad: u32,
    /// Exit-specific payload; only `vmcall` is currently modeled.
    pub u: KvmTdxExitU,
}
197 
/// Exit-specific payload of a TDX exit. Which member is valid is indicated
/// by `KvmTdxExit::type_`; as with any union, reading a member is `unsafe`.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub union KvmTdxExitU {
    pub vmcall: KvmTdxExitVmcall,
}
204 
/// TDG.VP.VMCALL exit payload: the guest's input registers plus the
/// output/status fields the VMM fills in before resuming the TD.
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct KvmTdxExitVmcall {
    pub type_: u64,
    // VMCALL sub-function, e.g. TDG_VP_VMCALL_GET_QUOTE.
    pub subfunction: u64,
    // Bitmap of guest registers exposed for this call.
    pub reg_mask: u64,
    // Guest -> VMM input registers.
    pub in_r12: u64,
    pub in_r13: u64,
    pub in_r14: u64,
    pub in_r15: u64,
    pub in_rbx: u64,
    pub in_rdi: u64,
    pub in_rsi: u64,
    pub in_r8: u64,
    pub in_r9: u64,
    pub in_rdx: u64,
    // VMM -> guest completion status (e.g. TDG_VP_VMCALL_SUCCESS).
    pub status_code: u64,
    // VMM -> guest output registers.
    pub out_r11: u64,
    pub out_r12: u64,
    pub out_r13: u64,
    pub out_r14: u64,
    pub out_r15: u64,
    pub out_rbx: u64,
    pub out_rdi: u64,
    pub out_rsi: u64,
    pub out_r8: u64,
    pub out_r9: u64,
    pub out_rdx: u64,
}
235 
236 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
237     fn from(region: kvm_userspace_memory_region) -> Self {
238         let mut flags = USER_MEMORY_REGION_READ;
239         if region.flags & KVM_MEM_READONLY == 0 {
240             flags |= USER_MEMORY_REGION_WRITE;
241         }
242         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
243             flags |= USER_MEMORY_REGION_LOG_DIRTY;
244         }
245 
246         UserMemoryRegion {
247             slot: region.slot,
248             guest_phys_addr: region.guest_phys_addr,
249             memory_size: region.memory_size,
250             userspace_addr: region.userspace_addr,
251             flags,
252         }
253     }
254 }
255 
256 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
257     fn from(region: UserMemoryRegion) -> Self {
258         assert!(
259             region.flags & USER_MEMORY_REGION_READ != 0,
260             "KVM mapped memory is always readable"
261         );
262 
263         let mut flags = 0;
264         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
265             flags |= KVM_MEM_READONLY;
266         }
267         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
268             flags |= KVM_MEM_LOG_DIRTY_PAGES;
269         }
270 
271         kvm_userspace_memory_region {
272             slot: region.slot,
273             guest_phys_addr: region.guest_phys_addr,
274             memory_size: region.memory_size,
275             userspace_addr: region.userspace_addr,
276             flags,
277         }
278     }
279 }
280 
281 impl From<kvm_mp_state> for MpState {
282     fn from(s: kvm_mp_state) -> Self {
283         MpState::Kvm(s)
284     }
285 }
286 
287 impl From<MpState> for kvm_mp_state {
288     fn from(ms: MpState) -> Self {
289         match ms {
290             MpState::Kvm(s) => s,
291             /* Needed in case other hypervisors are enabled */
292             #[allow(unreachable_patterns)]
293             _ => panic!("CpuState is not valid"),
294         }
295     }
296 }
297 
298 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
299     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
300         match a {
301             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
302             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
303         }
304     }
305 }
306 
307 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
308     fn from(a: IoEventAddress) -> Self {
309         match a {
310             IoEventAddress::Pio(x) => Self::Pio(x),
311             IoEventAddress::Mmio(x) => Self::Mmio(x),
312         }
313     }
314 }
315 
316 impl From<VcpuKvmState> for CpuState {
317     fn from(s: VcpuKvmState) -> Self {
318         CpuState::Kvm(s)
319     }
320 }
321 
322 impl From<CpuState> for VcpuKvmState {
323     fn from(s: CpuState) -> Self {
324         match s {
325             CpuState::Kvm(s) => s,
326             /* Needed in case other hypervisors are enabled */
327             #[allow(unreachable_patterns)]
328             _ => panic!("CpuState is not valid"),
329         }
330     }
331 }
332 
333 #[cfg(target_arch = "x86_64")]
334 impl From<kvm_clock_data> for ClockData {
335     fn from(d: kvm_clock_data) -> Self {
336         ClockData::Kvm(d)
337     }
338 }
339 
340 #[cfg(target_arch = "x86_64")]
341 impl From<ClockData> for kvm_clock_data {
342     fn from(ms: ClockData) -> Self {
343         match ms {
344             ClockData::Kvm(s) => s,
345             /* Needed in case other hypervisors are enabled */
346             #[allow(unreachable_patterns)]
347             _ => panic!("CpuState is not valid"),
348         }
349     }
350 }
351 
352 impl From<kvm_bindings::kvm_one_reg> for crate::Register {
353     fn from(s: kvm_bindings::kvm_one_reg) -> Self {
354         crate::Register::Kvm(s)
355     }
356 }
357 
358 impl From<crate::Register> for kvm_bindings::kvm_one_reg {
359     fn from(e: crate::Register) -> Self {
360         match e {
361             crate::Register::Kvm(e) => e,
362             /* Needed in case other hypervisors are enabled */
363             #[allow(unreachable_patterns)]
364             _ => panic!("Register is not valid"),
365         }
366     }
367 }
368 
#[cfg(target_arch = "aarch64")]
impl From<kvm_bindings::kvm_vcpu_init> for crate::VcpuInit {
    /// Wrap the raw KVM vCPU init structure in the generic enum.
    fn from(init: kvm_bindings::kvm_vcpu_init) -> Self {
        Self::Kvm(init)
    }
}
375 
#[cfg(target_arch = "aarch64")]
impl From<crate::VcpuInit> for kvm_bindings::kvm_vcpu_init {
    /// Extract the raw KVM vCPU init structure from the generic enum.
    ///
    /// # Panics
    /// Panics if the value belongs to a different hypervisor back-end.
    fn from(init: crate::VcpuInit) -> Self {
        match init {
            crate::VcpuInit::Kvm(kvm_init) => kvm_init,
            // Other hypervisor back-ends may add `VcpuInit` variants.
            #[allow(unreachable_patterns)]
            _ => panic!("VcpuInit is not valid"),
        }
    }
}
387 
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<kvm_bindings::RegList> for crate::RegList {
    /// Wrap the raw KVM register list in the generic enum.
    fn from(list: kvm_bindings::RegList) -> Self {
        Self::Kvm(list)
    }
}
394 
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<crate::RegList> for kvm_bindings::RegList {
    /// Extract the raw KVM register list from the generic enum.
    ///
    /// # Panics
    /// Panics if the list belongs to a different hypervisor back-end.
    fn from(list: crate::RegList) -> Self {
        match list {
            crate::RegList::Kvm(kvm_list) => kvm_list,
            // Other hypervisor back-ends may add `RegList` variants.
            #[allow(unreachable_patterns)]
            _ => panic!("RegList is not valid"),
        }
    }
}
406 
407 #[cfg(not(target_arch = "riscv64"))]
408 impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
409     fn from(s: kvm_bindings::kvm_regs) -> Self {
410         crate::StandardRegisters::Kvm(s)
411     }
412 }
413 
414 #[cfg(not(target_arch = "riscv64"))]
415 impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
416     fn from(e: crate::StandardRegisters) -> Self {
417         match e {
418             crate::StandardRegisters::Kvm(e) => e,
419             /* Needed in case other hypervisors are enabled */
420             #[allow(unreachable_patterns)]
421             _ => panic!("StandardRegisters are not valid"),
422         }
423     }
424 }
425 
#[cfg(target_arch = "riscv64")]
impl From<kvm_bindings::kvm_riscv_core> for crate::StandardRegisters {
    /// Wrap the raw RISC-V core register set in the generic enum.
    fn from(core_regs: kvm_bindings::kvm_riscv_core) -> Self {
        Self::Kvm(core_regs)
    }
}
432 
#[cfg(target_arch = "riscv64")]
impl From<crate::StandardRegisters> for kvm_bindings::kvm_riscv_core {
    /// Extract the raw RISC-V core register set from the generic enum.
    ///
    /// # Panics
    /// Panics if the registers belong to a different hypervisor back-end.
    fn from(regs: crate::StandardRegisters) -> Self {
        match regs {
            crate::StandardRegisters::Kvm(core_regs) => core_regs,
            // Other hypervisor back-ends may add variants.
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}
444 
445 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
446     fn from(s: kvm_irq_routing_entry) -> Self {
447         IrqRoutingEntry::Kvm(s)
448     }
449 }
450 
451 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
452     fn from(e: IrqRoutingEntry) -> Self {
453         match e {
454             IrqRoutingEntry::Kvm(e) => e,
455             /* Needed in case other hypervisors are enabled */
456             #[allow(unreachable_patterns)]
457             _ => panic!("IrqRoutingEntry is not valid"),
458         }
459     }
460 }
461 
/// Cached geometry of a memory slot that was created with dirty-page logging
/// requested. `start_dirty_log`/`stop_dirty_log` re-register the slot from
/// this data with `KVM_MEM_LOG_DIRTY_PAGES` toggled.
struct KvmDirtyLogSlot {
    // KVM memory-slot index.
    slot: u32,
    // Guest physical address of the region.
    guest_phys_addr: u64,
    // Region size in bytes.
    memory_size: u64,
    // Host virtual address backing the region.
    userspace_addr: u64,
}
468 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the kernel VM file descriptor.
    fd: Arc<VmFd>,
    // MSR entries cloned into every vCPU created from this VM.
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    // Slots that requested dirty-page logging; see `KvmDirtyLogSlot`.
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
476 
477 impl KvmVm {
478     ///
479     /// Creates an emulated device in the kernel.
480     ///
481     /// See the documentation for `KVM_CREATE_DEVICE`.
482     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
483         let device_fd = self
484             .fd
485             .create_device(device)
486             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
487         Ok(VfioDeviceFd::new_from_kvm(device_fd))
488     }
489     /// Checks if a particular `Cap` is available.
490     pub fn check_extension(&self, c: Cap) -> bool {
491         self.fd.check_extension(c)
492     }
493 }
494 
495 /// Implementation of Vm trait for KVM
496 ///
497 /// # Examples
498 ///
499 /// ```
500 /// # use hypervisor::kvm::KvmHypervisor;
501 /// # use std::sync::Arc;
502 /// let kvm = KvmHypervisor::new().unwrap();
503 /// let hypervisor = Arc::new(kvm);
504 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
505 /// ```
506 impl vm::Vm for KvmVm {
507     #[cfg(target_arch = "x86_64")]
508     ///
509     /// Sets the address of the one-page region in the VM's address space.
510     ///
511     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
512         self.fd
513             .set_identity_map_address(address)
514             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
515     }
516 
517     #[cfg(target_arch = "x86_64")]
518     ///
519     /// Sets the address of the three-page region in the VM's address space.
520     ///
521     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
522         self.fd
523             .set_tss_address(offset)
524             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
525     }
526 
527     #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
528     ///
529     /// Creates an in-kernel interrupt controller.
530     ///
531     fn create_irq_chip(&self) -> vm::Result<()> {
532         self.fd
533             .create_irq_chip()
534             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
535     }
536 
537     ///
538     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
539     ///
540     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
541         self.fd
542             .register_irqfd(fd, gsi)
543             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
544     }
545 
546     ///
547     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
548     ///
549     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
550         self.fd
551             .unregister_irqfd(fd, gsi)
552             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
553     }
554 
555     ///
556     /// Creates a VcpuFd object from a vcpu RawFd.
557     ///
558     fn create_vcpu(
559         &self,
560         id: u8,
561         vm_ops: Option<Arc<dyn VmOps>>,
562     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
563         let fd = self
564             .fd
565             .create_vcpu(id as u64)
566             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
567         let vcpu = KvmVcpu {
568             fd: Arc::new(Mutex::new(fd)),
569             #[cfg(target_arch = "x86_64")]
570             msrs: self.msrs.clone(),
571             vm_ops,
572             #[cfg(target_arch = "x86_64")]
573             hyperv_synic: AtomicBool::new(false),
574         };
575         Ok(Arc::new(vcpu))
576     }
577 
578     #[cfg(target_arch = "aarch64")]
579     ///
580     /// Creates a virtual GIC device.
581     ///
582     fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
583         let gic_device = KvmGicV3Its::new(self, config)
584             .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
585         Ok(Arc::new(Mutex::new(gic_device)))
586     }
587 
588     #[cfg(target_arch = "riscv64")]
589     ///
590     /// Creates a virtual AIA device.
591     ///
592     fn create_vaia(&self, config: VaiaConfig) -> vm::Result<Arc<Mutex<dyn Vaia>>> {
593         let aia_device = KvmAiaImsics::new(self, config)
594             .map_err(|e| vm::HypervisorVmError::CreateVaia(anyhow!("Vaia error {:?}", e)))?;
595         Ok(Arc::new(Mutex::new(aia_device)))
596     }
597 
598     ///
599     /// Registers an event to be signaled whenever a certain address is written to.
600     ///
601     fn register_ioevent(
602         &self,
603         fd: &EventFd,
604         addr: &IoEventAddress,
605         datamatch: Option<vm::DataMatch>,
606     ) -> vm::Result<()> {
607         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
608         if let Some(dm) = datamatch {
609             match dm {
610                 vm::DataMatch::DataMatch32(kvm_dm32) => self
611                     .fd
612                     .register_ioevent(fd, addr, kvm_dm32)
613                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
614                 vm::DataMatch::DataMatch64(kvm_dm64) => self
615                     .fd
616                     .register_ioevent(fd, addr, kvm_dm64)
617                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
618             }
619         } else {
620             self.fd
621                 .register_ioevent(fd, addr, NoDatamatch)
622                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
623         }
624     }
625 
626     ///
627     /// Unregisters an event from a certain address it has been previously registered to.
628     ///
629     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
630         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
631         self.fd
632             .unregister_ioevent(fd, addr, NoDatamatch)
633             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
634     }
635 
636     ///
637     /// Constructs a routing entry
638     ///
639     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
640         match &config {
641             InterruptSourceConfig::MsiIrq(cfg) => {
642                 let mut kvm_route = kvm_irq_routing_entry {
643                     gsi,
644                     type_: KVM_IRQ_ROUTING_MSI,
645                     ..Default::default()
646                 };
647 
648                 kvm_route.u.msi.address_lo = cfg.low_addr;
649                 kvm_route.u.msi.address_hi = cfg.high_addr;
650                 kvm_route.u.msi.data = cfg.data;
651 
652                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
653                     // On AArch64, there is limitation on the range of the 'devid',
654                     // it cannot be greater than 65536 (the max of u16).
655                     //
656                     // BDF cannot be used directly, because 'segment' is in high
657                     // 16 bits. The layout of the u32 BDF is:
658                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
659                     // |      segment    |     bus    |   device   |  function  |
660                     //
661                     // Now that we support 1 bus only in a segment, we can build a
662                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
663                     // 'segment' data.
664                     // This way we can resolve the range checking problem and give
665                     // different `devid` to all the devices. Limitation is that at
666                     // most 256 segments can be supported.
667                     //
668                     let modified_devid = ((cfg.devid & 0x00ff_0000) >> 8) | cfg.devid & 0xff;
669 
670                     kvm_route.flags = KVM_MSI_VALID_DEVID;
671                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
672                 }
673                 kvm_route.into()
674             }
675             InterruptSourceConfig::LegacyIrq(cfg) => {
676                 let mut kvm_route = kvm_irq_routing_entry {
677                     gsi,
678                     type_: KVM_IRQ_ROUTING_IRQCHIP,
679                     ..Default::default()
680                 };
681                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
682                 kvm_route.u.irqchip.pin = cfg.pin;
683 
684                 kvm_route.into()
685             }
686         }
687     }
688 
689     ///
690     /// Sets the GSI routing table entries, overwriting any previously set
691     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
692     ///
693     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
694         let mut irq_routing =
695             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
696         irq_routing[0].nr = entries.len() as u32;
697         irq_routing[0].flags = 0;
698         let entries: Vec<kvm_irq_routing_entry> = entries
699             .iter()
700             .map(|entry| match entry {
701                 IrqRoutingEntry::Kvm(e) => *e,
702                 #[allow(unreachable_patterns)]
703                 _ => panic!("IrqRoutingEntry type is wrong"),
704             })
705             .collect();
706 
707         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
708         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
709         // everything from entries.
710         unsafe {
711             let entries_slice: &mut [kvm_irq_routing_entry] =
712                 irq_routing[0].entries.as_mut_slice(entries.len());
713             entries_slice.copy_from_slice(&entries);
714         }
715 
716         self.fd
717             .set_gsi_routing(&irq_routing[0])
718             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
719     }
720 
721     ///
722     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
723     ///
724     fn make_user_memory_region(
725         &self,
726         slot: u32,
727         guest_phys_addr: u64,
728         memory_size: u64,
729         userspace_addr: u64,
730         readonly: bool,
731         log_dirty_pages: bool,
732     ) -> UserMemoryRegion {
733         kvm_userspace_memory_region {
734             slot,
735             guest_phys_addr,
736             memory_size,
737             userspace_addr,
738             flags: if readonly { KVM_MEM_READONLY } else { 0 }
739                 | if log_dirty_pages {
740                     KVM_MEM_LOG_DIRTY_PAGES
741                 } else {
742                     0
743                 },
744         }
745         .into()
746     }
747 
748     ///
749     /// Creates a guest physical memory region.
750     ///
751     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
752         let mut region: kvm_userspace_memory_region = user_memory_region.into();
753 
754         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
755             if (region.flags & KVM_MEM_READONLY) != 0 {
756                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
757                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
758                 )));
759             }
760 
761             // Keep track of the regions that need dirty pages log
762             self.dirty_log_slots.write().unwrap().insert(
763                 region.slot,
764                 KvmDirtyLogSlot {
765                     slot: region.slot,
766                     guest_phys_addr: region.guest_phys_addr,
767                     memory_size: region.memory_size,
768                     userspace_addr: region.userspace_addr,
769                 },
770             );
771 
772             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
773             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
774             region.flags = 0;
775         }
776 
777         // SAFETY: Safe because guest regions are guaranteed not to overlap.
778         unsafe {
779             self.fd
780                 .set_user_memory_region(region)
781                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
782         }
783     }
784 
785     ///
786     /// Removes a guest physical memory region.
787     ///
788     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
789         let mut region: kvm_userspace_memory_region = user_memory_region.into();
790 
791         // Remove the corresponding entry from "self.dirty_log_slots" if needed
792         self.dirty_log_slots.write().unwrap().remove(&region.slot);
793 
794         // Setting the size to 0 means "remove"
795         region.memory_size = 0;
796         // SAFETY: Safe because guest regions are guaranteed not to overlap.
797         unsafe {
798             self.fd
799                 .set_user_memory_region(region)
800                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
801         }
802     }
803 
804     ///
805     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
806     ///
807     #[cfg(target_arch = "aarch64")]
808     fn get_preferred_target(&self, kvi: &mut crate::VcpuInit) -> vm::Result<()> {
809         let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
810         self.fd
811             .get_preferred_target(&mut kvm_kvi)
812             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))?;
813         *kvi = kvm_kvi.into();
814         Ok(())
815     }
816 
817     #[cfg(target_arch = "x86_64")]
818     fn enable_split_irq(&self) -> vm::Result<()> {
819         // Create split irqchip
820         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
821         // are not.
822         let mut cap = kvm_enable_cap {
823             cap: KVM_CAP_SPLIT_IRQCHIP,
824             ..Default::default()
825         };
826         cap.args[0] = NUM_IOAPIC_PINS as u64;
827         self.fd
828             .enable_cap(&cap)
829             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
830         Ok(())
831     }
832 
833     #[cfg(target_arch = "x86_64")]
834     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
835         let mut cap = kvm_enable_cap {
836             cap: KVM_CAP_SGX_ATTRIBUTE,
837             ..Default::default()
838         };
839         cap.args[0] = file.as_raw_fd() as u64;
840         self.fd
841             .enable_cap(&cap)
842             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
843         Ok(())
844     }
845 
846     /// Retrieve guest clock.
847     #[cfg(target_arch = "x86_64")]
848     fn get_clock(&self) -> vm::Result<ClockData> {
849         Ok(self
850             .fd
851             .get_clock()
852             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
853             .into())
854     }
855 
856     /// Set guest clock.
857     #[cfg(target_arch = "x86_64")]
858     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
859         let data = (*data).into();
860         self.fd
861             .set_clock(&data)
862             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
863     }
864 
865     /// Create a device that is used for passthrough
866     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
867         let mut vfio_dev = kvm_create_device {
868             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
869             fd: 0,
870             flags: 0,
871         };
872 
873         self.create_device(&mut vfio_dev)
874             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
875     }
876 
877     ///
878     /// Start logging dirty pages
879     ///
880     fn start_dirty_log(&self) -> vm::Result<()> {
881         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
882         for (_, s) in dirty_log_slots.iter() {
883             let region = kvm_userspace_memory_region {
884                 slot: s.slot,
885                 guest_phys_addr: s.guest_phys_addr,
886                 memory_size: s.memory_size,
887                 userspace_addr: s.userspace_addr,
888                 flags: KVM_MEM_LOG_DIRTY_PAGES,
889             };
890             // SAFETY: Safe because guest regions are guaranteed not to overlap.
891             unsafe {
892                 self.fd
893                     .set_user_memory_region(region)
894                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
895             }
896         }
897 
898         Ok(())
899     }
900 
901     ///
902     /// Stop logging dirty pages
903     ///
904     fn stop_dirty_log(&self) -> vm::Result<()> {
905         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
906         for (_, s) in dirty_log_slots.iter() {
907             let region = kvm_userspace_memory_region {
908                 slot: s.slot,
909                 guest_phys_addr: s.guest_phys_addr,
910                 memory_size: s.memory_size,
911                 userspace_addr: s.userspace_addr,
912                 flags: 0,
913             };
914             // SAFETY: Safe because guest regions are guaranteed not to overlap.
915             unsafe {
916                 self.fd
917                     .set_user_memory_region(region)
918                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
919             }
920         }
921 
922         Ok(())
923     }
924 
925     ///
926     /// Get dirty pages bitmap (one bit per page)
927     ///
928     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
929         self.fd
930             .get_dirty_log(slot, memory_size as usize)
931             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
932     }
933 
    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        // Bit position of the SEPT_VE_DISABLE attribute in the TD
        // `attributes` word — NOTE(review): presumably disables #VE
        // injection on Secure-EPT violations; confirm against the TDX
        // module spec.
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        // Convert to KVM's cpuid entry layout and pad to the fixed
        // 256-entry table size the ioctl payload below requires.
        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        // Payload of the TdxCommand::InitVm ioctl; #[repr(C)] because the
        // kernel consumes this layout field-for-field.
        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            // Measurement/owner registers are left zeroed.
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            // The unwrap is safe: the vector was resized to exactly 256
            // entries above, matching the array length.
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }
977 
978     ///
979     /// Finalize the TDX setup for this VM
980     ///
981     #[cfg(feature = "tdx")]
982     fn tdx_finalize(&self) -> vm::Result<()> {
983         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
984             .map_err(vm::HypervisorVmError::FinalizeTdx)
985     }
986 
987     ///
988     /// Initialize memory regions for the TDX VM
989     ///
990     #[cfg(feature = "tdx")]
991     fn tdx_init_memory_region(
992         &self,
993         host_address: u64,
994         guest_address: u64,
995         size: u64,
996         measure: bool,
997     ) -> vm::Result<()> {
998         #[repr(C)]
999         struct TdxInitMemRegion {
1000             host_address: u64,
1001             guest_address: u64,
1002             pages: u64,
1003         }
1004         let data = TdxInitMemRegion {
1005             host_address,
1006             guest_address,
1007             pages: size / 4096,
1008         };
1009 
1010         tdx_command(
1011             &self.fd.as_raw_fd(),
1012             TdxCommand::InitMemRegion,
1013             u32::from(measure),
1014             &data as *const _ as u64,
1015         )
1016         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
1017     }
1018 
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        // Expose self as &dyn Any so callers can downcast_ref::<KvmVm>()
        // when they need KVM-specific behavior behind the Vm trait.
        self
    }
1023 }
1024 
#[cfg(feature = "tdx")]
/// Issue a TDX `command` to the given KVM fd through the
/// KVM_MEMORY_ENCRYPT_OP ioctl.
///
/// `data` is the address of a command-specific payload cast to u64 (0 when
/// the command takes none); `flags` is the command-specific flag word.
/// Returns the raw OS error when the ioctl fails.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Mirror of the kernel's TDX command structure; #[repr(C)] so the
    // field order and padding match what the ioctl expects.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    // A negative return indicates failure; errno carries the cause.
    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
1061 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to the KVM subsystem (/dev/kvm) used for system-level ioctls
    // such as VM creation and capability queries.
    kvm: Kvm,
}
1066 
1067 impl KvmHypervisor {
1068     #[cfg(target_arch = "x86_64")]
1069     ///
1070     /// Retrieve the list of MSRs supported by the hypervisor.
1071     ///
1072     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
1073         self.kvm
1074             .get_msr_index_list()
1075             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
1076     }
1077 }
1078 
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    /// A required KVM capability (extension) is not exposed by the host.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
1085 
/// Convenience result alias for KVM-specific operations.
pub type KvmResult<T> = result::Result<T, KvmError>;
1087 
1088 impl KvmHypervisor {
1089     /// Create a hypervisor based on Kvm
1090     #[allow(clippy::new_ret_no_self)]
1091     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
1092         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
1093         let api_version = kvm_obj.get_api_version();
1094 
1095         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
1096             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
1097         }
1098 
1099         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
1100     }
1101 
1102     /// Check if the hypervisor is available
1103     pub fn is_available() -> hypervisor::Result<bool> {
1104         match std::fs::metadata("/dev/kvm") {
1105             Ok(_) => Ok(true),
1106             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1107             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
1108                 err.into(),
1109             )),
1110         }
1111     }
1112 }
1113 
/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    ///
    /// Create a Vm of a specific type using the underlying hypervisor, passing memory size
    /// Return a hypervisor-agnostic Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type_and_memory(0).unwrap();
    /// ```
    fn create_vm_with_type_and_memory(
        &self,
        vm_type: u64,
        #[cfg(feature = "sev_snp")] _mem_size: u64,
    ) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        // KVM does not need the memory size at creation time; defer to
        // the plain typed creation path.
        self.create_vm_with_type(vm_type)
    }

    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            // Pre-build the MSR entry table from the host's supported MSR
            // index list — NOTE(review): presumably reused when reading
            // vCPU MSR state; confirm against KvmVcpu's msrs usage.
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        // Delegates to the per-architecture extension check imported at
        // the top of this module.
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        // Convert each kvm_cpuid_entry2 into the hypervisor-agnostic
        // CpuIdEntry type.
        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        // The kernel fills `data` in place through the ioctl; declare the
        // maximum cpuid-config capacity up front.
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            // Fixed at 4 on x86_64 (the four hardware debug registers).
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            // On AArch64 the count is queried from KVM.
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        // Clamp the host-reported usize to u32::MAX before narrowing.
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}
1309 
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Lock-protected handle to the per-vCPU KVM fd; all register and run
    // ioctls in this file go through this lock.
    fd: Arc<Mutex<VcpuFd>>,
    #[cfg(target_arch = "x86_64")]
    // MSR entry table — NOTE(review): presumably seeded from the VM's
    // supported-MSR list; confirm in the vCPU creation path.
    msrs: Vec<MsrEntry>,
    // Optional callbacks into the VMM (VmOps trait); None when the vCPU
    // is used without them.
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    // Tracks whether Hyper-V SynIC has been enabled on this vCPU.
    hyperv_synic: AtomicBool,
}
1319 
1320 /// Implementation of Vcpu trait for KVM
1321 ///
1322 /// # Examples
1323 ///
1324 /// ```
1325 /// # use hypervisor::kvm::KvmHypervisor;
1326 /// # use std::sync::Arc;
1327 /// let kvm = KvmHypervisor::new().unwrap();
1328 /// let hypervisor = Arc::new(kvm);
1329 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1330 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1331 /// ```
1332 impl cpu::Vcpu for KvmVcpu {
1333     ///
1334     /// Returns StandardRegisters with default value set
1335     ///
1336     fn create_standard_regs(&self) -> StandardRegisters {
1337         #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
1338         {
1339             kvm_bindings::kvm_regs::default().into()
1340         }
1341         #[cfg(target_arch = "riscv64")]
1342         {
1343             kvm_bindings::kvm_riscv_core::default().into()
1344         }
1345     }
1346     #[cfg(target_arch = "x86_64")]
1347     ///
1348     /// Returns the vCPU general purpose registers.
1349     ///
1350     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1351         Ok(self
1352             .fd
1353             .lock()
1354             .unwrap()
1355             .get_regs()
1356             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1357             .into())
1358     }
1359 
1360     ///
1361     /// Returns the vCPU general purpose registers.
1362     /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
1363     /// is used to get registers one by one.
1364     ///
1365     #[cfg(target_arch = "aarch64")]
1366     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1367         let mut state = kvm_regs::default();
1368         let mut off = offset_of!(user_pt_regs, regs);
1369         // There are 31 user_pt_regs:
1370         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1371         // These actually are the general-purpose registers of the Armv8-a
1372         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1373         for i in 0..31 {
1374             let mut bytes = [0_u8; 8];
1375             self.fd
1376                 .lock()
1377                 .unwrap()
1378                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1379                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1380             state.regs.regs[i] = u64::from_le_bytes(bytes);
1381             off += std::mem::size_of::<u64>();
1382         }
1383 
1384         // We are now entering the "Other register" section of the ARMv8-a architecture.
1385         // First one, stack pointer.
1386         let off = offset_of!(user_pt_regs, sp);
1387         let mut bytes = [0_u8; 8];
1388         self.fd
1389             .lock()
1390             .unwrap()
1391             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1392             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1393         state.regs.sp = u64::from_le_bytes(bytes);
1394 
1395         // Second one, the program counter.
1396         let off = offset_of!(user_pt_regs, pc);
1397         let mut bytes = [0_u8; 8];
1398         self.fd
1399             .lock()
1400             .unwrap()
1401             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1402             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1403         state.regs.pc = u64::from_le_bytes(bytes);
1404 
1405         // Next is the processor state.
1406         let off = offset_of!(user_pt_regs, pstate);
1407         let mut bytes = [0_u8; 8];
1408         self.fd
1409             .lock()
1410             .unwrap()
1411             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1412             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1413         state.regs.pstate = u64::from_le_bytes(bytes);
1414 
1415         // The stack pointer associated with EL1
1416         let off = offset_of!(kvm_regs, sp_el1);
1417         let mut bytes = [0_u8; 8];
1418         self.fd
1419             .lock()
1420             .unwrap()
1421             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1422             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1423         state.sp_el1 = u64::from_le_bytes(bytes);
1424 
1425         // Exception Link Register for EL1, when taking an exception to EL1, this register
1426         // holds the address to which to return afterwards.
1427         let off = offset_of!(kvm_regs, elr_el1);
1428         let mut bytes = [0_u8; 8];
1429         self.fd
1430             .lock()
1431             .unwrap()
1432             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1433             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1434         state.elr_el1 = u64::from_le_bytes(bytes);
1435 
1436         // Saved Program Status Registers, there are 5 of them used in the kernel.
1437         let mut off = offset_of!(kvm_regs, spsr);
1438         for i in 0..KVM_NR_SPSR as usize {
1439             let mut bytes = [0_u8; 8];
1440             self.fd
1441                 .lock()
1442                 .unwrap()
1443                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1444                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1445             state.spsr[i] = u64::from_le_bytes(bytes);
1446             off += std::mem::size_of::<u64>();
1447         }
1448 
1449         // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
1450         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1451         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1452         for i in 0..32 {
1453             let mut bytes = [0_u8; 16];
1454             self.fd
1455                 .lock()
1456                 .unwrap()
1457                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
1458                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1459             state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
1460             off += mem::size_of::<u128>();
1461         }
1462 
1463         // Floating-point Status Register
1464         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1465         let mut bytes = [0_u8; 4];
1466         self.fd
1467             .lock()
1468             .unwrap()
1469             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1470             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1471         state.fp_regs.fpsr = u32::from_le_bytes(bytes);
1472 
1473         // Floating-point Control Register
1474         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1475         let mut bytes = [0_u8; 4];
1476         self.fd
1477             .lock()
1478             .unwrap()
1479             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1480             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1481         state.fp_regs.fpcr = u32::from_le_bytes(bytes);
1482         Ok(state.into())
1483     }
1484 
    #[cfg(target_arch = "riscv64")]
    ///
    /// Returns the RISC-V vCPU core registers.
    /// The `KVM_GET_REGS` ioctl is not available on RISC-V 64-bit,
    /// `KVM_GET_ONE_REG` is used to get registers one by one.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state = kvm_riscv_core::default();

        /// Macro used to extract RISC-V register data from KVM Vcpu according
        /// to `$reg_name` provided to `state`.
        macro_rules! riscv64_get_one_reg_from_vcpu {
            // `mode` lives directly in kvm_riscv_core (not inside the
            // nested user_regs_struct), so it needs a dedicated arm.
            (mode) => {
                let off = offset_of!(kvm_riscv_core, mode);
                let mut bytes = [0_u8; 8];
                self.fd
                    .lock()
                    .unwrap()
                    .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
                    .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
                state.mode = u64::from_le_bytes(bytes);
            };
            // General case: a named register inside `state.regs`.
            ($reg_name:ident) => {
                let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
                let mut bytes = [0_u8; 8];
                self.fd
                    .lock()
                    .unwrap()
                    .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
                    .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
                state.regs.$reg_name = u64::from_le_bytes(bytes);
            };
        }

        // Read every core register, one KVM_GET_ONE_REG ioctl each.
        riscv64_get_one_reg_from_vcpu!(pc);
        riscv64_get_one_reg_from_vcpu!(ra);
        riscv64_get_one_reg_from_vcpu!(sp);
        riscv64_get_one_reg_from_vcpu!(gp);
        riscv64_get_one_reg_from_vcpu!(tp);
        riscv64_get_one_reg_from_vcpu!(t0);
        riscv64_get_one_reg_from_vcpu!(t1);
        riscv64_get_one_reg_from_vcpu!(t2);
        riscv64_get_one_reg_from_vcpu!(s0);
        riscv64_get_one_reg_from_vcpu!(s1);
        riscv64_get_one_reg_from_vcpu!(a0);
        riscv64_get_one_reg_from_vcpu!(a1);
        riscv64_get_one_reg_from_vcpu!(a2);
        riscv64_get_one_reg_from_vcpu!(a3);
        riscv64_get_one_reg_from_vcpu!(a4);
        riscv64_get_one_reg_from_vcpu!(a5);
        riscv64_get_one_reg_from_vcpu!(a6);
        riscv64_get_one_reg_from_vcpu!(a7);
        riscv64_get_one_reg_from_vcpu!(s2);
        riscv64_get_one_reg_from_vcpu!(s3);
        riscv64_get_one_reg_from_vcpu!(s4);
        riscv64_get_one_reg_from_vcpu!(s5);
        riscv64_get_one_reg_from_vcpu!(s6);
        riscv64_get_one_reg_from_vcpu!(s7);
        riscv64_get_one_reg_from_vcpu!(s8);
        riscv64_get_one_reg_from_vcpu!(s9);
        riscv64_get_one_reg_from_vcpu!(s10);
        riscv64_get_one_reg_from_vcpu!(s11);
        riscv64_get_one_reg_from_vcpu!(t3);
        riscv64_get_one_reg_from_vcpu!(t4);
        riscv64_get_one_reg_from_vcpu!(t5);
        riscv64_get_one_reg_from_vcpu!(t6);
        riscv64_get_one_reg_from_vcpu!(mode);

        Ok(state.into())
    }
1555 
1556     #[cfg(target_arch = "x86_64")]
1557     ///
1558     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1559     ///
1560     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1561         let regs = (*regs).into();
1562         self.fd
1563             .lock()
1564             .unwrap()
1565             .set_regs(&regs)
1566             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1567     }
1568 
1569     ///
1570     /// Sets the vCPU general purpose registers.
1571     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1572     /// is used to set registers one by one.
1573     ///
1574     #[cfg(target_arch = "aarch64")]
1575     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1576         // The function follows the exact identical order from `state`. Look there
1577         // for some additional info on registers.
1578         let kvm_regs_state: kvm_regs = (*state).into();
1579         let mut off = offset_of!(user_pt_regs, regs);
1580         for i in 0..31 {
1581             self.fd
1582                 .lock()
1583                 .unwrap()
1584                 .set_one_reg(
1585                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1586                     &kvm_regs_state.regs.regs[i].to_le_bytes(),
1587                 )
1588                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1589             off += std::mem::size_of::<u64>();
1590         }
1591 
1592         let off = offset_of!(user_pt_regs, sp);
1593         self.fd
1594             .lock()
1595             .unwrap()
1596             .set_one_reg(
1597                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1598                 &kvm_regs_state.regs.sp.to_le_bytes(),
1599             )
1600             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1601 
1602         let off = offset_of!(user_pt_regs, pc);
1603         self.fd
1604             .lock()
1605             .unwrap()
1606             .set_one_reg(
1607                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1608                 &kvm_regs_state.regs.pc.to_le_bytes(),
1609             )
1610             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1611 
1612         let off = offset_of!(user_pt_regs, pstate);
1613         self.fd
1614             .lock()
1615             .unwrap()
1616             .set_one_reg(
1617                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1618                 &kvm_regs_state.regs.pstate.to_le_bytes(),
1619             )
1620             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1621 
1622         let off = offset_of!(kvm_regs, sp_el1);
1623         self.fd
1624             .lock()
1625             .unwrap()
1626             .set_one_reg(
1627                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1628                 &kvm_regs_state.sp_el1.to_le_bytes(),
1629             )
1630             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1631 
1632         let off = offset_of!(kvm_regs, elr_el1);
1633         self.fd
1634             .lock()
1635             .unwrap()
1636             .set_one_reg(
1637                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1638                 &kvm_regs_state.elr_el1.to_le_bytes(),
1639             )
1640             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1641 
1642         let mut off = offset_of!(kvm_regs, spsr);
1643         for i in 0..KVM_NR_SPSR as usize {
1644             self.fd
1645                 .lock()
1646                 .unwrap()
1647                 .set_one_reg(
1648                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1649                     &kvm_regs_state.spsr[i].to_le_bytes(),
1650                 )
1651                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1652             off += std::mem::size_of::<u64>();
1653         }
1654 
1655         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1656         for i in 0..32 {
1657             self.fd
1658                 .lock()
1659                 .unwrap()
1660                 .set_one_reg(
1661                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1662                     &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
1663                 )
1664                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1665             off += mem::size_of::<u128>();
1666         }
1667 
1668         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1669         self.fd
1670             .lock()
1671             .unwrap()
1672             .set_one_reg(
1673                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1674                 &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
1675             )
1676             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1677 
1678         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1679         self.fd
1680             .lock()
1681             .unwrap()
1682             .set_one_reg(
1683                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1684                 &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
1685             )
1686             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1687         Ok(())
1688     }
1689 
    #[cfg(target_arch = "riscv64")]
    ///
    /// Sets the RISC-V vCPU core registers.
    /// The `KVM_SET_REGS` ioctl is not available on RISC-V 64-bit,
    /// `KVM_SET_ONE_REG` is used to set registers one by one.
    ///
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact identical order from `state`. Look there
        // for some additional info on registers.
        let kvm_regs_state: kvm_riscv_core = (*state).into();

        /// Macro used to set value of specific RISC-V `$reg_name` stored in
        /// `state` to KVM Vcpu.
        macro_rules! riscv64_set_one_reg_to_vcpu {
            // `mode` is a direct field of `kvm_riscv_core` (it does not live
            // inside the nested `regs` struct), so it needs a dedicated arm
            // with a different offset computation.
            (mode) => {
                let off = offset_of!(kvm_riscv_core, mode);
                self.fd
                    .lock()
                    .unwrap()
                    .set_one_reg(
                        riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
                        &kvm_regs_state.mode.to_le_bytes(),
                    )
                    .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
            };
            // All other registers (pc + GPRs) are nested in
            // `kvm_riscv_core.regs` (a `user_regs_struct`); the KVM register
            // ID is derived from the field's byte offset within the struct.
            ($reg_name:ident) => {
                let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
                self.fd
                    .lock()
                    .unwrap()
                    .set_one_reg(
                        riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
                        &kvm_regs_state.regs.$reg_name.to_le_bytes(),
                    )
                    .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
            };
        }

        riscv64_set_one_reg_to_vcpu!(pc);
        riscv64_set_one_reg_to_vcpu!(ra);
        riscv64_set_one_reg_to_vcpu!(sp);
        riscv64_set_one_reg_to_vcpu!(gp);
        riscv64_set_one_reg_to_vcpu!(tp);
        riscv64_set_one_reg_to_vcpu!(t0);
        riscv64_set_one_reg_to_vcpu!(t1);
        riscv64_set_one_reg_to_vcpu!(t2);
        riscv64_set_one_reg_to_vcpu!(s0);
        riscv64_set_one_reg_to_vcpu!(s1);
        riscv64_set_one_reg_to_vcpu!(a0);
        riscv64_set_one_reg_to_vcpu!(a1);
        riscv64_set_one_reg_to_vcpu!(a2);
        riscv64_set_one_reg_to_vcpu!(a3);
        riscv64_set_one_reg_to_vcpu!(a4);
        riscv64_set_one_reg_to_vcpu!(a5);
        riscv64_set_one_reg_to_vcpu!(a6);
        riscv64_set_one_reg_to_vcpu!(a7);
        riscv64_set_one_reg_to_vcpu!(s2);
        riscv64_set_one_reg_to_vcpu!(s3);
        riscv64_set_one_reg_to_vcpu!(s4);
        riscv64_set_one_reg_to_vcpu!(s5);
        riscv64_set_one_reg_to_vcpu!(s6);
        riscv64_set_one_reg_to_vcpu!(s7);
        riscv64_set_one_reg_to_vcpu!(s8);
        riscv64_set_one_reg_to_vcpu!(s9);
        riscv64_set_one_reg_to_vcpu!(s10);
        riscv64_set_one_reg_to_vcpu!(s11);
        riscv64_set_one_reg_to_vcpu!(t3);
        riscv64_set_one_reg_to_vcpu!(t4);
        riscv64_set_one_reg_to_vcpu!(t5);
        riscv64_set_one_reg_to_vcpu!(t6);
        riscv64_set_one_reg_to_vcpu!(mode);

        Ok(())
    }
1764 
1765     #[cfg(target_arch = "x86_64")]
1766     ///
1767     /// Returns the vCPU special registers.
1768     ///
1769     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1770         Ok(self
1771             .fd
1772             .lock()
1773             .unwrap()
1774             .get_sregs()
1775             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1776             .into())
1777     }
1778 
1779     #[cfg(target_arch = "x86_64")]
1780     ///
1781     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1782     ///
1783     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1784         let sregs = (*sregs).into();
1785         self.fd
1786             .lock()
1787             .unwrap()
1788             .set_sregs(&sregs)
1789             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1790     }
1791 
1792     #[cfg(target_arch = "x86_64")]
1793     ///
1794     /// Returns the floating point state (FPU) from the vCPU.
1795     ///
1796     fn get_fpu(&self) -> cpu::Result<FpuState> {
1797         Ok(self
1798             .fd
1799             .lock()
1800             .unwrap()
1801             .get_fpu()
1802             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1803             .into())
1804     }
1805 
1806     #[cfg(target_arch = "x86_64")]
1807     ///
1808     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1809     ///
1810     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1811         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1812         self.fd
1813             .lock()
1814             .unwrap()
1815             .set_fpu(&fpu)
1816             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1817     }
1818 
1819     #[cfg(target_arch = "x86_64")]
1820     ///
1821     /// X86 specific call to setup the CPUID registers.
1822     ///
1823     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1824         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1825             cpuid.iter().map(|e| (*e).into()).collect();
1826         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1827             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1828 
1829         self.fd
1830             .lock()
1831             .unwrap()
1832             .set_cpuid2(&kvm_cpuid)
1833             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1834     }
1835 
1836     #[cfg(target_arch = "x86_64")]
1837     ///
1838     /// X86 specific call to enable HyperV SynIC
1839     ///
1840     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1841         // Update the information about Hyper-V SynIC being enabled and
1842         // emulated as it will influence later which MSRs should be saved.
1843         self.hyperv_synic.store(true, Ordering::Release);
1844 
1845         let cap = kvm_enable_cap {
1846             cap: KVM_CAP_HYPERV_SYNIC,
1847             ..Default::default()
1848         };
1849         self.fd
1850             .lock()
1851             .unwrap()
1852             .enable_cap(&cap)
1853             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1854     }
1855 
1856     ///
1857     /// X86 specific call to retrieve the CPUID registers.
1858     ///
1859     #[cfg(target_arch = "x86_64")]
1860     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1861         let kvm_cpuid = self
1862             .fd
1863             .lock()
1864             .unwrap()
1865             .get_cpuid2(num_entries)
1866             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1867 
1868         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1869 
1870         Ok(v)
1871     }
1872 
1873     #[cfg(target_arch = "x86_64")]
1874     ///
1875     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1876     ///
1877     fn get_lapic(&self) -> cpu::Result<LapicState> {
1878         Ok(self
1879             .fd
1880             .lock()
1881             .unwrap()
1882             .get_lapic()
1883             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1884             .into())
1885     }
1886 
1887     #[cfg(target_arch = "x86_64")]
1888     ///
1889     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1890     ///
1891     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1892         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1893         self.fd
1894             .lock()
1895             .unwrap()
1896             .set_lapic(&klapic)
1897             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1898     }
1899 
1900     #[cfg(target_arch = "x86_64")]
1901     ///
1902     /// Returns the model-specific registers (MSR) for this vCPU.
1903     ///
1904     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1905         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1906         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1907         let succ = self
1908             .fd
1909             .lock()
1910             .unwrap()
1911             .get_msrs(&mut kvm_msrs)
1912             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1913 
1914         msrs[..succ].copy_from_slice(
1915             &kvm_msrs.as_slice()[..succ]
1916                 .iter()
1917                 .map(|e| (*e).into())
1918                 .collect::<Vec<MsrEntry>>(),
1919         );
1920 
1921         Ok(succ)
1922     }
1923 
1924     #[cfg(target_arch = "x86_64")]
1925     ///
1926     /// Setup the model-specific registers (MSR) for this vCPU.
1927     /// Returns the number of MSR entries actually written.
1928     ///
1929     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1930         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1931         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1932         self.fd
1933             .lock()
1934             .unwrap()
1935             .set_msrs(&kvm_msrs)
1936             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1937     }
1938 
1939     ///
1940     /// Returns the vcpu's current "multiprocessing state".
1941     ///
1942     fn get_mp_state(&self) -> cpu::Result<MpState> {
1943         Ok(self
1944             .fd
1945             .lock()
1946             .unwrap()
1947             .get_mp_state()
1948             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1949             .into())
1950     }
1951 
1952     ///
1953     /// Sets the vcpu's current "multiprocessing state".
1954     ///
1955     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1956         self.fd
1957             .lock()
1958             .unwrap()
1959             .set_mp_state(mp_state.into())
1960             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1961     }
1962 
1963     #[cfg(target_arch = "x86_64")]
1964     ///
1965     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1966     ///
1967     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1968         let tr = self
1969             .fd
1970             .lock()
1971             .unwrap()
1972             .translate_gva(gva)
1973             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1974         // tr.valid is set if the GVA is mapped to valid GPA.
1975         match tr.valid {
1976             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1977                 "Invalid GVA: {:#x}",
1978                 gva
1979             ))),
1980             _ => Ok((tr.physical_address, 0)),
1981         }
1982     }
1983 
1984     ///
1985     /// Triggers the running of the current virtual CPU returning an exit reason.
1986     ///
1987     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1988         match self.fd.lock().unwrap().run() {
1989             Ok(run) => match run {
1990                 #[cfg(target_arch = "x86_64")]
1991                 VcpuExit::IoIn(addr, data) => {
1992                     if let Some(vm_ops) = &self.vm_ops {
1993                         return vm_ops
1994                             .pio_read(addr.into(), data)
1995                             .map(|_| cpu::VmExit::Ignore)
1996                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1997                     }
1998 
1999                     Ok(cpu::VmExit::Ignore)
2000                 }
2001                 #[cfg(target_arch = "x86_64")]
2002                 VcpuExit::IoOut(addr, data) => {
2003                     if let Some(vm_ops) = &self.vm_ops {
2004                         return vm_ops
2005                             .pio_write(addr.into(), data)
2006                             .map(|_| cpu::VmExit::Ignore)
2007                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2008                     }
2009 
2010                     Ok(cpu::VmExit::Ignore)
2011                 }
2012                 #[cfg(target_arch = "x86_64")]
2013                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
2014                 #[cfg(target_arch = "x86_64")]
2015                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
2016 
2017                 #[cfg(target_arch = "aarch64")]
2018                 VcpuExit::SystemEvent(event_type, flags) => {
2019                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
2020                     // On Aarch64, when the VM is shutdown, run() returns
2021                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
2022                     if event_type == KVM_SYSTEM_EVENT_RESET {
2023                         Ok(cpu::VmExit::Reset)
2024                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
2025                         Ok(cpu::VmExit::Shutdown)
2026                     } else {
2027                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2028                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
2029                             event_type,
2030                             flags
2031                         )))
2032                     }
2033                 }
2034 
2035                 VcpuExit::MmioRead(addr, data) => {
2036                     if let Some(vm_ops) = &self.vm_ops {
2037                         return vm_ops
2038                             .mmio_read(addr, data)
2039                             .map(|_| cpu::VmExit::Ignore)
2040                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2041                     }
2042 
2043                     Ok(cpu::VmExit::Ignore)
2044                 }
2045                 VcpuExit::MmioWrite(addr, data) => {
2046                     if let Some(vm_ops) = &self.vm_ops {
2047                         return vm_ops
2048                             .mmio_write(addr, data)
2049                             .map(|_| cpu::VmExit::Ignore)
2050                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2051                     }
2052 
2053                     Ok(cpu::VmExit::Ignore)
2054                 }
2055                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
2056                 #[cfg(feature = "tdx")]
2057                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
2058                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
2059 
2060                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2061                     "Unexpected exit reason on vcpu run: {:?}",
2062                     r
2063                 ))),
2064             },
2065 
2066             Err(ref e) => match e.errno() {
2067                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
2068                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2069                     "VCPU error {:?}",
2070                     e
2071                 ))),
2072             },
2073         }
2074     }
2075 
2076     #[cfg(target_arch = "x86_64")]
2077     ///
2078     /// Let the guest know that it has been paused, which prevents from
2079     /// potential soft lockups when being resumed.
2080     ///
2081     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
2082         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
2083             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
2084             // which could be because we're still in firmware or the guest doesn't
2085             // use KVM clock.
2086             if e.errno() != libc::EINVAL {
2087                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
2088             }
2089         }
2090 
2091         Ok(())
2092     }
2093 
    #[cfg(not(target_arch = "riscv64"))]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    /// `addrs` holds the guest addresses to install hardware breakpoints on;
    /// `singlestep` additionally enables single-step execution. The whole
    /// debug configuration is replaced atomically via `KVM_SET_GUEST_DEBUG`.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        // Enable guest debugging with hardware breakpoints; the control flag
        // name differs between the x86_64 and aarch64 KVM ABIs.
        let mut dbg = kvm_guest_debug {
            #[cfg(target_arch = "x86_64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            #[cfg(target_arch = "aarch64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set the debug registers.
        // Here we assume that the number of addresses do not exceed what
        // `Hypervisor::get_guest_debug_hw_bps()` specifies.
        #[cfg(target_arch = "x86_64")]
        {
            // DR7: Set bits 9 and 10.
            // bit 9: GE (global exact breakpoint enable) flag.
            // bit 10: always 1.
            dbg.arch.debugreg[7] = 0x0600;

            for (i, addr) in addrs.iter().enumerate() {
                // DR0-DR3 hold the breakpoint linear addresses.
                dbg.arch.debugreg[i] = addr.0;
                // Set global breakpoint enable flag (G0-G3) in DR7.
                dbg.arch.debugreg[7] |= 2 << (i * 2);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            for (i, addr) in addrs.iter().enumerate() {
                // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
                // bit 0: 1 (Enabled)
                // bit 1~2: 0b11 (PMC = EL1/EL0)
                // bit 5~8: 0b1111 (BAS = AArch64)
                // others: 0
                dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
                // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
                // bit 2~52: VA[2:52]
                dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
            }
        }
        self.fd
            .lock()
            .unwrap()
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
2150 
    #[cfg(target_arch = "aarch64")]
    /// Returns the vCPU feature that must be finalized (through
    /// `KVM_ARM_VCPU_FINALIZE`, see `vcpu_finalize`) before the vCPU can be
    /// used; only SVE is subject to finalization here.
    fn vcpu_get_finalized_features(&self) -> i32 {
        kvm_bindings::KVM_ARM_VCPU_SVE as i32
    }
2155 
2156     #[cfg(target_arch = "aarch64")]
2157     fn vcpu_set_processor_features(
2158         &self,
2159         vm: &Arc<dyn crate::Vm>,
2160         kvi: &mut crate::VcpuInit,
2161         id: u8,
2162     ) -> cpu::Result<()> {
2163         use std::arch::is_aarch64_feature_detected;
2164         #[allow(clippy::nonminimal_bool)]
2165         let sve_supported =
2166             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
2167 
2168         let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2169 
2170         // We already checked that the capability is supported.
2171         kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
2172         if vm
2173             .as_any()
2174             .downcast_ref::<crate::kvm::KvmVm>()
2175             .unwrap()
2176             .check_extension(Cap::ArmPmuV3)
2177         {
2178             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
2179         }
2180 
2181         if sve_supported
2182             && vm
2183                 .as_any()
2184                 .downcast_ref::<crate::kvm::KvmVm>()
2185                 .unwrap()
2186                 .check_extension(Cap::ArmSve)
2187         {
2188             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
2189         }
2190 
2191         // Non-boot cpus are powered off initially.
2192         if id > 0 {
2193             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
2194         }
2195 
2196         *kvi = kvm_kvi.into();
2197 
2198         Ok(())
2199     }
2200 
2201     ///
2202     /// Return VcpuInit with default value set
2203     ///
2204     #[cfg(target_arch = "aarch64")]
2205     fn create_vcpu_init(&self) -> crate::VcpuInit {
2206         kvm_bindings::kvm_vcpu_init::default().into()
2207     }
2208 
2209     #[cfg(target_arch = "aarch64")]
2210     fn vcpu_init(&self, kvi: &crate::VcpuInit) -> cpu::Result<()> {
2211         let kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2212         self.fd
2213             .lock()
2214             .unwrap()
2215             .vcpu_init(&kvm_kvi)
2216             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
2217     }
2218 
2219     #[cfg(target_arch = "aarch64")]
2220     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
2221         self.fd
2222             .lock()
2223             .unwrap()
2224             .vcpu_finalize(&feature)
2225             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
2226     }
2227 
2228     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
2229     ///
2230     /// Gets a list of the guest registers that are supported for the
2231     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
2232     ///
2233     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
2234         let mut kvm_reg_list: kvm_bindings::RegList = reg_list.clone().into();
2235         self.fd
2236             .lock()
2237             .unwrap()
2238             .get_reg_list(&mut kvm_reg_list)
2239             .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2240         *reg_list = kvm_reg_list.into();
2241         Ok(())
2242     }
2243 
2244     ///
2245     /// Gets the value of a system register
2246     ///
2247     #[cfg(target_arch = "aarch64")]
2248     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
2249         //
2250         // Arm Architecture Reference Manual defines the encoding of
2251         // AArch64 system registers, see
2252         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
2253         // While KVM defines another ID for each AArch64 system register,
2254         // which is used in calling `KVM_G/SET_ONE_REG` to access a system
2255         // register of a guest.
2256         // A mapping exists between the Arm standard encoding and the KVM ID.
2257         // This function takes the standard u32 ID as input parameter, converts
2258         // it to the corresponding KVM ID, and call `KVM_GET_ONE_REG` API to
2259         // get the value of the system parameter.
2260         //
2261         let id: u64 = KVM_REG_ARM64
2262             | KVM_REG_SIZE_U64
2263             | KVM_REG_ARM64_SYSREG as u64
2264             | ((((sys_reg) >> 5)
2265                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
2266                     | KVM_REG_ARM64_SYSREG_OP1_MASK
2267                     | KVM_REG_ARM64_SYSREG_CRN_MASK
2268                     | KVM_REG_ARM64_SYSREG_CRM_MASK
2269                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
2270         let mut bytes = [0_u8; 8];
2271         self.fd
2272             .lock()
2273             .unwrap()
2274             .get_one_reg(id, &mut bytes)
2275             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2276         Ok(u64::from_le_bytes(bytes))
2277     }
2278 
2279     ///
2280     /// Gets the value of a non-core register
2281     ///
2282     #[cfg(target_arch = "riscv64")]
2283     fn get_non_core_reg(&self, _non_core_reg: u32) -> cpu::Result<u64> {
2284         unimplemented!()
2285     }
2286 
2287     ///
2288     /// Configure core registers for a given CPU.
2289     ///
2290     #[cfg(target_arch = "aarch64")]
2291     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2292         #[allow(non_upper_case_globals)]
2293         // PSR (Processor State Register) bits.
2294         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
2295         const PSR_MODE_EL1h: u64 = 0x0000_0005;
2296         const PSR_F_BIT: u64 = 0x0000_0040;
2297         const PSR_I_BIT: u64 = 0x0000_0080;
2298         const PSR_A_BIT: u64 = 0x0000_0100;
2299         const PSR_D_BIT: u64 = 0x0000_0200;
2300         // Taken from arch/arm64/kvm/inject_fault.c.
2301         const PSTATE_FAULT_BITS_64: u64 =
2302             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
2303 
2304         let kreg_off = offset_of!(kvm_regs, regs);
2305 
2306         // Get the register index of the PSTATE (Processor State) register.
2307         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
2308         self.fd
2309             .lock()
2310             .unwrap()
2311             .set_one_reg(
2312                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
2313                 &PSTATE_FAULT_BITS_64.to_le_bytes(),
2314             )
2315             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2316 
2317         // Other vCPUs are powered off initially awaiting PSCI wakeup.
2318         if cpu_id == 0 {
2319             // Setting the PC (Processor Counter) to the current program address (kernel address).
2320             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
2321             self.fd
2322                 .lock()
2323                 .unwrap()
2324                 .set_one_reg(
2325                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
2326                     &boot_ip.to_le_bytes(),
2327                 )
2328                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2329 
2330             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2331             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2332             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
2333             // We are choosing to place it the end of DRAM. See `get_fdt_addr`.
2334             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
2335             self.fd
2336                 .lock()
2337                 .unwrap()
2338                 .set_one_reg(
2339                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
2340                     &fdt_start.to_le_bytes(),
2341                 )
2342                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2343         }
2344         Ok(())
2345     }
2346 
2347     #[cfg(target_arch = "riscv64")]
2348     ///
2349     /// Configure registers for a given RISC-V CPU.
2350     ///
2351     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2352         // Setting the A0 () to the hartid of this CPU.
2353         let a0 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a0);
2354         self.fd
2355             .lock()
2356             .unwrap()
2357             .set_one_reg(
2358                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a0),
2359                 &u64::from(cpu_id).to_le_bytes(),
2360             )
2361             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2362 
2363         // Setting the PC (Processor Counter) to the current program address (kernel address).
2364         let pc = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc);
2365         self.fd
2366             .lock()
2367             .unwrap()
2368             .set_one_reg(
2369                 riscv64_reg_id!(KVM_REG_RISCV_CORE, pc),
2370                 &boot_ip.to_le_bytes(),
2371             )
2372             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2373 
2374         // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2375         // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2376         // not exceed 64 kilobytes in size." -> https://www.kernel.org/doc/Documentation/arch/riscv/boot.txt.
2377         let a1 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a1);
2378         self.fd
2379             .lock()
2380             .unwrap()
2381             .set_one_reg(
2382                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a1),
2383                 &fdt_start.to_le_bytes(),
2384             )
2385             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2386 
2387         Ok(())
2388     }
2389 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a prepopulated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // See the ordering requirements documented above: MP state first.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stops at the first MSR it cannot read, so when fewer
            // than expected were returned, `msr_entries[num_msrs]` is the
            // faulty one. Skip it and retry with the remainder, repeating
            // until a whole chunk is read successfully.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                // The whole remaining chunk was read: we are done.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                // Another faulty MSR was hit, located at `start_pos + num_msrs`
                // in the original list.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        // Per the ordering notes above, VCPU events are gathered last.
        let vcpu_events = self.get_vcpu_events()?;
        let tsc_khz = self.tsc_khz()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
            tsc_khz,
        }
        .into())
    }
2513 
2514     ///
2515     /// Get the current AArch64 CPU state
2516     ///
2517     #[cfg(target_arch = "aarch64")]
2518     fn state(&self) -> cpu::Result<CpuState> {
2519         let mut state = VcpuKvmState {
2520             mp_state: self.get_mp_state()?.into(),
2521             ..Default::default()
2522         };
2523         // Get core registers
2524         state.core_regs = self.get_regs()?.into();
2525 
2526         // Get systerm register
2527         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2528         // For ArmV8 there are around 500 registers.
2529         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2530         let mut reg_list = kvm_bindings::RegList::new(500).unwrap();
2531         self.fd
2532             .lock()
2533             .unwrap()
2534             .get_reg_list(&mut reg_list)
2535             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2536 
2537         // At this point reg_list should contain: core registers and system
2538         // registers.
2539         // The register list contains the number of registers and their ids. We
2540         // will be needing to call KVM_GET_ONE_REG on each id in order to save
2541         // all of them. We carve out from the list  the core registers which are
2542         // represented in the kernel by kvm_regs structure and for which we can
2543         // calculate the id based on the offset in the structure.
2544         reg_list.retain(|regid| is_system_register(*regid));
2545 
2546         // Now, for the rest of the registers left in the previously fetched
2547         // register list, we are simply calling KVM_GET_ONE_REG.
2548         let indices = reg_list.as_slice();
2549         for index in indices.iter() {
2550             let mut bytes = [0_u8; 8];
2551             self.fd
2552                 .lock()
2553                 .unwrap()
2554                 .get_one_reg(*index, &mut bytes)
2555                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2556             sys_regs.push(kvm_bindings::kvm_one_reg {
2557                 id: *index,
2558                 addr: u64::from_le_bytes(bytes),
2559             });
2560         }
2561 
2562         state.sys_regs = sys_regs;
2563 
2564         Ok(state.into())
2565     }
2566 
2567     #[cfg(target_arch = "riscv64")]
2568     ///
2569     /// Get the current RISC-V 64-bit CPU state
2570     ///
2571     fn state(&self) -> cpu::Result<CpuState> {
2572         let mut state = VcpuKvmState {
2573             mp_state: self.get_mp_state()?.into(),
2574             ..Default::default()
2575         };
2576         // Get core registers
2577         state.core_regs = self.get_regs()?.into();
2578 
2579         // Get non-core register
2580         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2581         // For RISC-V 64-bit there are around 200 registers.
2582         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2583         let mut reg_list = kvm_bindings::RegList::new(200).unwrap();
2584         self.fd
2585             .lock()
2586             .unwrap()
2587             .get_reg_list(&mut reg_list)
2588             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2589 
2590         // At this point reg_list should contain:
2591         // - core registers
2592         // - config registers
2593         // - timer registers
2594         // - control and status registers
2595         // - AIA control and status registers
2596         // - smstateen control and status registers
2597         // - sbi_sta control and status registers.
2598         //
2599         // The register list contains the number of registers and their ids. We
2600         // will be needing to call KVM_GET_ONE_REG on each id in order to save
2601         // all of them. We carve out from the list the core registers which are
2602         // represented in the kernel by `kvm_riscv_core` structure and for which
2603         // we can calculate the id based on the offset in the structure.
2604         reg_list.retain(|regid| is_non_core_register(*regid));
2605 
2606         // Now, for the rest of the registers left in the previously fetched
2607         // register list, we are simply calling KVM_GET_ONE_REG.
2608         let indices = reg_list.as_slice();
2609         for index in indices.iter() {
2610             let mut bytes = [0_u8; 8];
2611             self.fd
2612                 .lock()
2613                 .unwrap()
2614                 .get_one_reg(*index, &mut bytes)
2615                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2616             sys_regs.push(kvm_bindings::kvm_one_reg {
2617                 id: *index,
2618                 addr: u64::from_le_bytes(bytes),
2619             });
2620         }
2621 
2622         state.non_core_regs = sys_regs;
2623 
2624         Ok(state.into())
2625     }
2626 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        // NOTE: the sequence below follows the ordering constraints listed
        // in the doc comment above (REGS before VCPU_EVENTS, SREGS before
        // LAPIC, LAPIC before MSRS); do not reorder these calls.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // tsc_khz is None when the frequency was not retrievable on save.
        if let Some(freq) = state.tsc_khz {
            self.set_tsc_khz(freq)?;
        }

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // set_msrs returns the count of successfully written entries, so
            // the entry at that index is the first one KVM rejected.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                // Every remaining MSR was accepted: the scan is complete.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        // Restored last, per the ordering constraints above: SET_REGS would
        // otherwise clear the pending exceptions this call restores.
        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
2716 
2717     ///
2718     /// Restore the previously saved AArch64 CPU state
2719     ///
2720     #[cfg(target_arch = "aarch64")]
2721     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2722         let state: VcpuKvmState = state.clone().into();
2723         // Set core registers
2724         self.set_regs(&state.core_regs.into())?;
2725         // Set system registers
2726         for reg in &state.sys_regs {
2727             self.fd
2728                 .lock()
2729                 .unwrap()
2730                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2731                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2732         }
2733 
2734         self.set_mp_state(state.mp_state.into())?;
2735 
2736         Ok(())
2737     }
2738 
2739     #[cfg(target_arch = "riscv64")]
2740     ///
2741     /// Restore the previously saved RISC-V 64-bit CPU state
2742     ///
2743     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2744         let state: VcpuKvmState = state.clone().into();
2745         // Set core registers
2746         self.set_regs(&state.core_regs.into())?;
2747         // Set system registers
2748         for reg in &state.non_core_regs {
2749             self.fd
2750                 .lock()
2751                 .unwrap()
2752                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2753                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2754         }
2755 
2756         self.set_mp_state(state.mp_state.into())?;
2757 
2758         Ok(())
2759     }
2760 
2761     ///
2762     /// Initialize TDX for this CPU
2763     ///
2764     #[cfg(feature = "tdx")]
2765     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2766         tdx_command(
2767             &self.fd.lock().unwrap().as_raw_fd(),
2768             TdxCommand::InitVcpu,
2769             0,
2770             hob_address,
2771         )
2772         .map_err(cpu::HypervisorCpuError::InitializeTdx)
2773     }
2774 
2775     ///
2776     /// Set the "immediate_exit" state
2777     ///
2778     fn set_immediate_exit(&self, exit: bool) {
2779         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
2780     }
2781 
2782     ///
2783     /// Returns the details about TDX exit reason
2784     ///
2785     #[cfg(feature = "tdx")]
2786     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2787         let mut fd = self.fd.as_ref().lock().unwrap();
2788         let kvm_run = fd.get_kvm_run();
2789         // SAFETY: accessing a union field in a valid structure
2790         let tdx_vmcall = unsafe {
2791             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2792                 as *mut KvmTdxExit))
2793                 .u
2794                 .vmcall
2795         };
2796 
2797         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2798 
2799         if tdx_vmcall.type_ != 0 {
2800             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2801         }
2802 
2803         match tdx_vmcall.subfunction {
2804             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2805             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2806                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2807             }
2808             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2809         }
2810     }
2811 
2812     ///
2813     /// Set the status code for TDX exit
2814     ///
2815     #[cfg(feature = "tdx")]
2816     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2817         let mut fd = self.fd.as_ref().lock().unwrap();
2818         let kvm_run = fd.get_kvm_run();
2819         // SAFETY: accessing a union field in a valid structure
2820         let tdx_vmcall = unsafe {
2821             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2822                 as *mut KvmTdxExit))
2823                 .u
2824                 .vmcall
2825         };
2826 
2827         tdx_vmcall.status_code = match status {
2828             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2829             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2830         };
2831     }
2832 
2833     #[cfg(target_arch = "x86_64")]
2834     ///
2835     /// Return the list of initial MSR entries for a VCPU
2836     ///
2837     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2838         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2839 
2840         [
2841             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2842             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2843             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2844             msr!(msr_index::MSR_STAR),
2845             msr!(msr_index::MSR_CSTAR),
2846             msr!(msr_index::MSR_LSTAR),
2847             msr!(msr_index::MSR_KERNEL_GS_BASE),
2848             msr!(msr_index::MSR_SYSCALL_MASK),
2849             msr!(msr_index::MSR_IA32_TSC),
2850             msr_data!(
2851                 msr_index::MSR_IA32_MISC_ENABLE,
2852                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2853             ),
2854             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2855         ]
2856         .to_vec()
2857     }
2858 
2859     #[cfg(target_arch = "aarch64")]
2860     fn has_pmu_support(&self) -> bool {
2861         let cpu_attr = kvm_bindings::kvm_device_attr {
2862             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2863             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2864             addr: 0x0,
2865             flags: 0,
2866         };
2867         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
2868     }
2869 
2870     #[cfg(target_arch = "aarch64")]
2871     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2872         let cpu_attr = kvm_bindings::kvm_device_attr {
2873             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2874             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2875             addr: 0x0,
2876             flags: 0,
2877         };
2878         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2879             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2880             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2881             addr: &irq as *const u32 as u64,
2882             flags: 0,
2883         };
2884         self.fd
2885             .lock()
2886             .unwrap()
2887             .set_device_attr(&cpu_attr_irq)
2888             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2889         self.fd
2890             .lock()
2891             .unwrap()
2892             .set_device_attr(&cpu_attr)
2893             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2894     }
2895 
2896     #[cfg(target_arch = "x86_64")]
2897     ///
2898     /// Get the frequency of the TSC if available
2899     ///
2900     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2901         match self.fd.lock().unwrap().get_tsc_khz() {
2902             Err(e) => {
2903                 if e.errno() == libc::EIO {
2904                     Ok(None)
2905                 } else {
2906                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2907                 }
2908             }
2909             Ok(v) => Ok(Some(v)),
2910         }
2911     }
2912 
2913     #[cfg(target_arch = "x86_64")]
2914     ///
2915     /// Set the frequency of the TSC if available
2916     ///
2917     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2918         match self.fd.lock().unwrap().set_tsc_khz(freq) {
2919             Err(e) => {
2920                 if e.errno() == libc::EIO {
2921                     Ok(())
2922                 } else {
2923                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2924                 }
2925             }
2926             Ok(_) => Ok(()),
2927         }
2928     }
2929 
2930     #[cfg(target_arch = "x86_64")]
2931     ///
2932     /// Trigger NMI interrupt
2933     ///
2934     fn nmi(&self) -> cpu::Result<()> {
2935         match self.fd.lock().unwrap().nmi() {
2936             Err(e) => {
2937                 if e.errno() == libc::EIO {
2938                     Ok(())
2939                 } else {
2940                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
2941                 }
2942             }
2943             Ok(_) => Ok(()),
2944         }
2945     }
2946 }
2947 
2948 impl KvmVcpu {
2949     #[cfg(target_arch = "x86_64")]
2950     ///
2951     /// X86 specific call that returns the vcpu's current "xsave struct".
2952     ///
2953     fn get_xsave(&self) -> cpu::Result<XsaveState> {
2954         Ok(self
2955             .fd
2956             .lock()
2957             .unwrap()
2958             .get_xsave()
2959             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
2960             .into())
2961     }
2962 
2963     #[cfg(target_arch = "x86_64")]
2964     ///
2965     /// X86 specific call that sets the vcpu's current "xsave struct".
2966     ///
2967     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
2968         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
2969         self.fd
2970             .lock()
2971             .unwrap()
2972             .set_xsave(&xsave)
2973             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2974     }
2975 
2976     #[cfg(target_arch = "x86_64")]
2977     ///
2978     /// X86 specific call that returns the vcpu's current "xcrs".
2979     ///
2980     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2981         self.fd
2982             .lock()
2983             .unwrap()
2984             .get_xcrs()
2985             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2986     }
2987 
2988     #[cfg(target_arch = "x86_64")]
2989     ///
2990     /// X86 specific call that sets the vcpu's current "xcrs".
2991     ///
2992     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2993         self.fd
2994             .lock()
2995             .unwrap()
2996             .set_xcrs(xcrs)
2997             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2998     }
2999 
3000     #[cfg(target_arch = "x86_64")]
3001     ///
3002     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
3003     /// states of the vcpu.
3004     ///
3005     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
3006         self.fd
3007             .lock()
3008             .unwrap()
3009             .get_vcpu_events()
3010             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
3011     }
3012 
3013     #[cfg(target_arch = "x86_64")]
3014     ///
3015     /// Sets pending exceptions, interrupts, and NMIs as well as related states
3016     /// of the vcpu.
3017     ///
3018     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
3019         self.fd
3020             .lock()
3021             .unwrap()
3022             .set_vcpu_events(events)
3023             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
3024     }
3025 }
3026 
#[cfg(test)]
mod tests {
    // Round-trips the full RISC-V core register file through
    // KVM_SET_ONE_REG / KVM_GET_ONE_REG and checks the values survive
    // unchanged. Requires a host with /dev/kvm available.
    #[test]
    #[cfg(target_arch = "riscv64")]
    fn test_get_and_set_regs() {
        use super::*;

        let kvm = KvmHypervisor::new().unwrap();
        let hypervisor = Arc::new(kvm);
        let vm = hypervisor.create_vm().expect("new VM fd creation failed");
        let vcpu0 = vm.create_vcpu(0, None).unwrap();

        // Every register gets a distinct value so a swapped or dropped
        // register would make the final equality check fail.
        let core_regs = StandardRegisters::from(kvm_riscv_core {
            regs: user_regs_struct {
                pc: 0x00,
                ra: 0x01,
                sp: 0x02,
                gp: 0x03,
                tp: 0x04,
                t0: 0x05,
                t1: 0x06,
                t2: 0x07,
                s0: 0x08,
                s1: 0x09,
                a0: 0x0a,
                a1: 0x0b,
                a2: 0x0c,
                a3: 0x0d,
                a4: 0x0e,
                a5: 0x0f,
                a6: 0x10,
                a7: 0x11,
                s2: 0x12,
                s3: 0x13,
                s4: 0x14,
                s5: 0x15,
                s6: 0x16,
                s7: 0x17,
                s8: 0x18,
                s9: 0x19,
                s10: 0x1a,
                s11: 0x1b,
                t3: 0x1c,
                t4: 0x1d,
                t5: 0x1e,
                t6: 0x1f,
            },
            mode: 0x00,
        });

        vcpu0.set_regs(&core_regs).unwrap();
        assert_eq!(vcpu0.get_regs().unwrap(), core_regs);
    }
}
3081