// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, RwLock};

use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use vmm_sys_util::eventfd::EventFd;

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset_of};
use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType};
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};

#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
#[cfg(target_arch = "aarch64")]
use std::mem;

pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
#[cfg(feature = "tdx")]
use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
pub use kvm_ioctls::{Cap, Kvm};
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
pub use {kvm_bindings, kvm_ioctls};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(target_arch = "x86_64")]
use vmm_sys_util::ioctl_io_nr;
#[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
use vmm_sys_util::ioctl_ioc_nr;

#[cfg(target_arch = "x86_64")]
ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
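// The discriminants above are passed as the `command` field of the
// `TdxIoctlCmd` structure handed to `KVM_MEMORY_ENCRYPT_OP` (see
// `tdx_command` below).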

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

#[cfg(feature = "tdx")]
#[derive(Copy, Clone)]
pub struct KvmTdxExit {
    pub type_: u32,
    pub pad: u32,
    pub u: KvmTdxExitU,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub union KvmTdxExitU {
    pub vmcall: KvmTdxExitVmcall,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct KvmTdxExitVmcall {
    pub type_: u64,
    pub subfunction: u64,
    pub reg_mask: u64,
    pub in_r12: u64,
    pub in_r13: u64,
    pub in_r14: u64,
    pub in_r15: u64,
    pub in_rbx: u64,
    pub in_rdi: u64,
    pub in_rsi: u64,
    pub in_r8: u64,
    pub in_r9: u64,
    pub in_rdx: u64,
    pub status_code: u64,
    pub out_r11: u64,
    pub out_r12: u64,
    pub out_r13: u64,
    pub out_r14: u64,
    pub out_r15: u64,
    pub out_rbx: u64,
    pub out_rdi: u64,
    pub out_rsi: u64,
    pub out_r8: u64,
    pub out_r9: u64,
    pub out_rdx: u64,
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
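
// Illustrative flag mapping for the two conversions above: a KVM region whose
// flags are exactly KVM_MEM_LOG_DIRTY_PAGES becomes USER_MEMORY_REGION_READ |
// USER_MEMORY_REGION_WRITE | USER_MEMORY_REGION_LOG_DIRTY, while a generic
// read-only region converts back to flags == KVM_MEM_READONLY.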

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
    fn from(s: kvm_bindings::kvm_regs) -> Self {
        crate::StandardRegisters::Kvm(s)
    }
}

impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
    fn from(e: crate::StandardRegisters) -> Self {
        match e {
            crate::StandardRegisters::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}
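
// `KvmDirtyLogSlot` is pure bookkeeping: `create_user_memory_region` records
// every slot that requested dirty-page logging, so that `start_dirty_log` and
// `stop_dirty_log` can later toggle `KVM_MEM_LOG_DIRTY_PAGES` on it.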

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    pub fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

/// Implementation of Vm trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }

    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }

    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }

    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }

    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let fd = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: Arc::new(Mutex::new(fd)),
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }

    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }

    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it must fit in a u16 (at most 65535).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // is in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This way we can resolve the range-checking problem and
                    // give a different `devid` to every device. The limitation
                    // is that at most 256 segments can be supported.
                    //
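                    // Worked example (hypothetical BDF): segment 0x0001, bus
                    // 0x00, device 0x02, function 0 encode to devid
                    // 0x0001_0010; the transform below yields 0x0110.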
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
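        // `vec_with_array_field` (a helper from this crate) allocates enough
        // `kvm_irq_routing` elements to also cover a trailing flexible array
        // of `entries.len()` routing entries; only index 0 is the header.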
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }

    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
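    ///
    /// A minimal sketch (the addresses below are for illustration only):
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let _region = vm.make_user_memory_region(0, 0x1000, 0x1000, 0, false, false);
    /// ```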
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }

    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }

    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }

    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in the
        // kernel; the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }

    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }

    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }

    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
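        // `pages` counts the number of 4 KiB pages covering `size`, hence the
        // division by 4096 below.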
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }

    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}

pub type KvmResult<T> = result::Result<T, KvmError>;

impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }

    /// Check if the hypervisor is available
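    ///
    /// A minimal sketch (probing does not require KVM itself to be usable):
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// if KvmHypervisor::is_available().unwrap() {
    ///     let _hypervisor = KvmHypervisor::new().unwrap();
    /// }
    /// ```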
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}

/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    /// Create a KVM VM object of a specific VM type and return it as a Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl was
                        // interrupted and we have to retry, as this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM VM object and return it as a Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }
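        // The IPA limit is passed directly as the machine type here, relying
        // on the KVM convention that the low bits of the VM type encode the
        // IPA size (cf. KVM_VM_TYPE_ARM_IPA_SIZE in the KVM UAPI).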

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get the maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}

/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: Arc<Mutex<VcpuFd>>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}

/// Implementation of Vcpu trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// ```
impl cpu::Vcpu for KvmVcpu {
    ///
    /// Returns StandardRegisters with default value set
    ///
    #[cfg(target_arch = "x86_64")]
    fn create_standard_regs(&self) -> StandardRegisters {
        kvm_bindings::kvm_regs::default().into()
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }

    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64; `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state = kvm_regs::default();
        let mut off = offset_of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the ARMv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as
        // 32-bit registers).
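        // A note on `arm64_core_reg_id!`: a core register id is assumed to be
        // built as KVM_REG_ARM64 | KVM_REG_SIZE_* | KVM_REG_ARM_CORE plus the
        // offset into kvm_regs expressed in 32-bit words (offset / 4).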
        for i in 0..31 {
            let mut bytes = [0_u8; 8];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.regs.regs[i] = u64::from_le_bytes(bytes);
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other registers" section of the ARMv8-A architecture.
        // First one, the stack pointer.
        let off = offset_of!(user_pt_regs, sp);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.sp = u64::from_le_bytes(bytes);

        // Second one, the program counter.
        let off = offset_of!(user_pt_regs, pc);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.pc = u64::from_le_bytes(bytes);

        // Next is the processor state.
        let off = offset_of!(user_pt_regs, pstate);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.pstate = u64::from_le_bytes(bytes);

        // The stack pointer associated with EL1.
        let off = offset_of!(kvm_regs, sp_el1);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.sp_el1 = u64::from_le_bytes(bytes);

        // The Exception Link Register for EL1: when taking an exception to
        // EL1, this register holds the address to return to afterwards.
        let off = offset_of!(kvm_regs, elr_el1);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.elr_el1 = u64::from_le_bytes(bytes);

        // The Saved Program Status Registers; the kernel uses 5 of them.
        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            let mut bytes = [0_u8; 8];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.spsr[i] = u64::from_le_bytes(bytes);
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in
        // the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            let mut bytes = [0_u8; 16];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
            off += mem::size_of::<u128>();
        }

        // The Floating-point Status Register.
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        let mut bytes = [0_u8; 4];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.fp_regs.fpsr = u32::from_le_bytes(bytes);

        // The Floating-point Control Register.
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        let mut bytes = [0_u8; 4];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.fp_regs.fpcr = u32::from_le_bytes(bytes);
        Ok(state.into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .lock()
            .unwrap()
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64; `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function writes the registers in exactly the same order as
        // `get_regs` reads them. Look there for additional info on each
        // register.
        let kvm_regs_state: kvm_regs = (*state).into();
        let mut off = offset_of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    &kvm_regs_state.regs.regs[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset_of!(user_pt_regs, sp);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.sp.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pc);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.pc.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pstate);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.pstate.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, sp_el1);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.sp_el1.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, elr_el1);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.elr_el1.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    &kvm_regs_state.spsr[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
        Ok(())
    }
1486 
1487     #[cfg(target_arch = "x86_64")]
1488     ///
1489     /// Returns the vCPU special registers.
1490     ///
1491     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1492         Ok(self
1493             .fd
1494             .lock()
1495             .unwrap()
1496             .get_sregs()
1497             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1498             .into())
1499     }
1500 
1501     #[cfg(target_arch = "x86_64")]
1502     ///
1503     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1504     ///
1505     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1506         let sregs = (*sregs).into();
1507         self.fd
1508             .lock()
1509             .unwrap()
1510             .set_sregs(&sregs)
1511             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1512     }
1513 
1514     #[cfg(target_arch = "x86_64")]
1515     ///
1516     /// Returns the floating point state (FPU) from the vCPU.
1517     ///
1518     fn get_fpu(&self) -> cpu::Result<FpuState> {
1519         Ok(self
1520             .fd
1521             .lock()
1522             .unwrap()
1523             .get_fpu()
1524             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1525             .into())
1526     }
1527 
1528     #[cfg(target_arch = "x86_64")]
1529     ///
1530     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1531     ///
1532     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1533         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1534         self.fd
1535             .lock()
1536             .unwrap()
1537             .set_fpu(&fpu)
1538             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1539     }
1540 
1541     #[cfg(target_arch = "x86_64")]
1542     ///
1543     /// X86 specific call to setup the CPUID registers.
1544     ///
1545     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1546         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1547             cpuid.iter().map(|e| (*e).into()).collect();
1548         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1549             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1550 
1551         self.fd
1552             .lock()
1553             .unwrap()
1554             .set_cpuid2(&kvm_cpuid)
1555             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1556     }
1557 
1558     #[cfg(target_arch = "x86_64")]
1559     ///
1560     /// X86 specific call to enable the Hyper-V SynIC.
1561     ///
1562     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1563         // Update the information about Hyper-V SynIC being enabled and
1564         // emulated as it will influence later which MSRs should be saved.
1565         self.hyperv_synic.store(true, Ordering::Release);
1566 
1567         let cap = kvm_enable_cap {
1568             cap: KVM_CAP_HYPERV_SYNIC,
1569             ..Default::default()
1570         };
1571         self.fd
1572             .lock()
1573             .unwrap()
1574             .enable_cap(&cap)
1575             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1576     }
1577 
1578     ///
1579     /// X86 specific call to retrieve the CPUID registers.
1580     ///
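    /// # Example
    ///
    /// An illustrative sketch (not a doctest); `vcpu` is assumed to have been
    /// created as in the `state()` example further below.
    ///
    /// ```ignore
    /// // Fetch up to the KVM-defined maximum number of CPUID entries.
    /// let entries = vcpu.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
    /// ```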
1581     #[cfg(target_arch = "x86_64")]
1582     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1583         let kvm_cpuid = self
1584             .fd
1585             .lock()
1586             .unwrap()
1587             .get_cpuid2(num_entries)
1588             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1589 
1590         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1591 
1592         Ok(v)
1593     }
1594 
1595     #[cfg(target_arch = "x86_64")]
1596     ///
1597     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1598     ///
1599     fn get_lapic(&self) -> cpu::Result<LapicState> {
1600         Ok(self
1601             .fd
1602             .lock()
1603             .unwrap()
1604             .get_lapic()
1605             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1606             .into())
1607     }
1608 
1609     #[cfg(target_arch = "x86_64")]
1610     ///
1611     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1612     ///
1613     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1614         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1615         self.fd
1616             .lock()
1617             .unwrap()
1618             .set_lapic(&klapic)
1619             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1620     }
1621 
1622     #[cfg(target_arch = "x86_64")]
1623     ///
1624     /// Returns the model-specific registers (MSRs) for this vCPU.
1625     ///
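    /// # Example
    ///
    /// An illustrative sketch (not a doctest): KVM only fills in the `data`
    /// field of entries whose `index` has been prepopulated.
    ///
    /// ```ignore
    /// // 0x174 is IA32_SYSENTER_CS, used here purely as an example index.
    /// let mut msrs = vec![MsrEntry { index: 0x174, ..Default::default() }];
    /// let read = vcpu.get_msrs(&mut msrs)?;
    /// assert!(read <= msrs.len());
    /// ```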
1626     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1627         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1628         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1629         let succ = self
1630             .fd
1631             .lock()
1632             .unwrap()
1633             .get_msrs(&mut kvm_msrs)
1634             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1635 
1636         msrs[..succ].copy_from_slice(
1637             &kvm_msrs.as_slice()[..succ]
1638                 .iter()
1639                 .map(|e| (*e).into())
1640                 .collect::<Vec<MsrEntry>>(),
1641         );
1642 
1643         Ok(succ)
1644     }
1645 
1646     #[cfg(target_arch = "x86_64")]
1647     ///
1648     /// Set up the model-specific registers (MSRs) for this vCPU.
1649     /// Returns the number of MSR entries actually written.
1650     ///
1651     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1652         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1653         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1654         self.fd
1655             .lock()
1656             .unwrap()
1657             .set_msrs(&kvm_msrs)
1658             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1659     }
1660 
1661     ///
1662     /// Returns the vcpu's current "multiprocessing state".
1663     ///
1664     fn get_mp_state(&self) -> cpu::Result<MpState> {
1665         Ok(self
1666             .fd
1667             .lock()
1668             .unwrap()
1669             .get_mp_state()
1670             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1671             .into())
1672     }
1673 
1674     ///
1675     /// Sets the vcpu's current "multiprocessing state".
1676     ///
1677     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1678         self.fd
1679             .lock()
1680             .unwrap()
1681             .set_mp_state(mp_state.into())
1682             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1683     }
1684 
1685     #[cfg(target_arch = "x86_64")]
1686     ///
1687     /// Translates a guest virtual address to a guest physical address using the `KVM_TRANSLATE` ioctl.
1688     ///
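    /// # Example
    ///
    /// An illustrative sketch (not a doctest); the `flags` argument is unused
    /// by this KVM implementation.
    ///
    /// ```ignore
    /// let (gpa, _flags) = vcpu.translate_gva(0xffff_8000_0000_0000, 0)?;
    /// ```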
1689     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1690         let tr = self
1691             .fd
1692             .lock()
1693             .unwrap()
1694             .translate_gva(gva)
1695             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1696         // tr.valid is set if the GVA is mapped to valid GPA.
1697         match tr.valid {
1698             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1699                 "Invalid GVA: {:#x}",
1700                 gva
1701             ))),
1702             _ => Ok((tr.physical_address, 0)),
1703         }
1704     }
1705 
1706     ///
1707     /// Runs the current virtual CPU until it exits, returning the exit reason.
1708     ///
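    /// # Example
    ///
    /// An illustrative vCPU loop (not a doctest); real callers dispatch on
    /// many more exit reasons.
    ///
    /// ```ignore
    /// loop {
    ///     match vcpu.run()? {
    ///         // I/O was already forwarded through the registered VmOps.
    ///         cpu::VmExit::Ignore => continue,
    ///         cpu::VmExit::Reset => break,
    ///         exit => panic!("unhandled exit: {:?}", exit),
    ///     }
    /// }
    /// ```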
1709     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1710         match self.fd.lock().unwrap().run() {
1711             Ok(run) => match run {
1712                 #[cfg(target_arch = "x86_64")]
1713                 VcpuExit::IoIn(addr, data) => {
1714                     if let Some(vm_ops) = &self.vm_ops {
1715                         return vm_ops
1716                             .pio_read(addr.into(), data)
1717                             .map(|_| cpu::VmExit::Ignore)
1718                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1719                     }
1720 
1721                     Ok(cpu::VmExit::Ignore)
1722                 }
1723                 #[cfg(target_arch = "x86_64")]
1724                 VcpuExit::IoOut(addr, data) => {
1725                     if let Some(vm_ops) = &self.vm_ops {
1726                         return vm_ops
1727                             .pio_write(addr.into(), data)
1728                             .map(|_| cpu::VmExit::Ignore)
1729                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1730                     }
1731 
1732                     Ok(cpu::VmExit::Ignore)
1733                 }
1734                 #[cfg(target_arch = "x86_64")]
1735                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1736                 #[cfg(target_arch = "x86_64")]
1737                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1738 
1739                 #[cfg(target_arch = "aarch64")]
1740                 VcpuExit::SystemEvent(event_type, flags) => {
1741                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1742                     // On AArch64, when the VM is shut down, run() returns
1743                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1744                     if event_type == KVM_SYSTEM_EVENT_RESET {
1745                         Ok(cpu::VmExit::Reset)
1746                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1747                         Ok(cpu::VmExit::Shutdown)
1748                     } else {
1749                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1750                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
1751                             event_type,
1752                             flags
1753                         )))
1754                     }
1755                 }
1756 
1757                 VcpuExit::MmioRead(addr, data) => {
1758                     if let Some(vm_ops) = &self.vm_ops {
1759                         return vm_ops
1760                             .mmio_read(addr, data)
1761                             .map(|_| cpu::VmExit::Ignore)
1762                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1763                     }
1764 
1765                     Ok(cpu::VmExit::Ignore)
1766                 }
1767                 VcpuExit::MmioWrite(addr, data) => {
1768                     if let Some(vm_ops) = &self.vm_ops {
1769                         return vm_ops
1770                             .mmio_write(addr, data)
1771                             .map(|_| cpu::VmExit::Ignore)
1772                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1773                     }
1774 
1775                     Ok(cpu::VmExit::Ignore)
1776                 }
1777                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1778                 #[cfg(feature = "tdx")]
1779                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1780                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1781 
1782                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1783                     "Unexpected exit reason on vcpu run: {:?}",
1784                     r
1785                 ))),
1786             },
1787 
1788             Err(ref e) => match e.errno() {
1789                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1790                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1791                     "VCPU error {:?}",
1792                     e
1793                 ))),
1794             },
1795         }
1796     }
1797 
1798     #[cfg(target_arch = "x86_64")]
1799     ///
1800     /// Let the guest know that it has been paused, which prevents
1801     /// potential soft lockups when it is resumed.
1802     ///
1803     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1804         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
1805             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1806             // which could be because we're still in firmware or the guest doesn't
1807             // use KVM clock.
1808             if e.errno() != libc::EINVAL {
1809                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1810             }
1811         }
1812 
1813         Ok(())
1814     }
1815 
1816     ///
1817     /// Sets debug registers to set hardware breakpoints and/or enable single-stepping.
1818     ///
1819     fn set_guest_debug(
1820         &self,
1821         addrs: &[vm_memory::GuestAddress],
1822         singlestep: bool,
1823     ) -> cpu::Result<()> {
1824         let mut dbg = kvm_guest_debug {
1825             #[cfg(target_arch = "x86_64")]
1826             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1827             #[cfg(target_arch = "aarch64")]
1828             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1829             ..Default::default()
1830         };
1831         if singlestep {
1832             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1833         }
1834 
1835         // Set the debug registers.
1836         // Here we assume that the number of addresses does not exceed what
1837         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1838         #[cfg(target_arch = "x86_64")]
1839         {
1840             // Set bits 9 and 10.
1841             // bit 9: GE (global exact breakpoint enable) flag.
1842             // bit 10: always 1.
1843             dbg.arch.debugreg[7] = 0x0600;
1844 
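            // For example (illustrative): with two breakpoint addresses, the
            // loop below additionally sets the G0 (bit 1) and G1 (bit 3)
            // global-enable flags, leaving DR7 = 0x060a.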
1845             for (i, addr) in addrs.iter().enumerate() {
1846                 dbg.arch.debugreg[i] = addr.0;
1847                 // Set global breakpoint enable flag
1848                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1849             }
1850         }
1851         #[cfg(target_arch = "aarch64")]
1852         {
1853             for (i, addr) in addrs.iter().enumerate() {
1854                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1855                 // bit 0: 1 (Enabled)
1856                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1857                 // bit 5~8: 0b1111 (BAS = AArch64)
1858                 // others: 0
1859                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1860                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1861                 // bit 2~52: VA[2:52]
1862                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1863             }
1864         }
1865         self.fd
1866             .lock()
1867             .unwrap()
1868             .set_guest_debug(&dbg)
1869             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1870     }
1871 
1872     #[cfg(target_arch = "aarch64")]
1873     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1874         self.fd
1875             .lock()
1876             .unwrap()
1877             .vcpu_init(kvi)
1878             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1879     }
1880 
1881     #[cfg(target_arch = "aarch64")]
1882     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
1883         self.fd
1884             .lock()
1885             .unwrap()
1886             .vcpu_finalize(&feature)
1887             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
1888     }
1889 
1890     ///
1891     /// Gets a list of the guest registers that are supported for the
1892     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1893     ///
1894     #[cfg(target_arch = "aarch64")]
1895     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1896         self.fd
1897             .lock()
1898             .unwrap()
1899             .get_reg_list(reg_list)
1900             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1901     }
1902 
1903     ///
1904     /// Gets the value of a system register
1905     ///
1906     #[cfg(target_arch = "aarch64")]
1907     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1908         //
1909         // The Arm Architecture Reference Manual defines the encoding of
1910         // AArch64 system registers, see
1911         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1912         // KVM defines its own ID for each AArch64 system register, which
1913         // is used when calling `KVM_GET/SET_ONE_REG` to access a system
1914         // register of a guest.
1915         // A mapping exists between the Arm standard encoding and the KVM ID.
1916         // This function takes the standard u32 ID as input parameter, converts
1917         // it to the corresponding KVM ID, and calls `KVM_GET_ONE_REG` to
1918         // read the value of the system register.
1919         //
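        // Field layout of the KVM ID, going by the mask definitions in the
        // kernel's asm/kvm.h: Op0 in bits [15:14], Op1 in [13:11],
        // CRn in [10:7], CRm in [6:3] and Op2 in [2:0].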
1920         let id: u64 = KVM_REG_ARM64
1921             | KVM_REG_SIZE_U64
1922             | KVM_REG_ARM64_SYSREG as u64
1923             | ((((sys_reg) >> 5)
1924                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1925                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1926                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1927                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1928                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1929         let mut bytes = [0_u8; 8];
1930         self.fd
1931             .lock()
1932             .unwrap()
1933             .get_one_reg(id, &mut bytes)
1934             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
1935         Ok(u64::from_le_bytes(bytes))
1936     }
1937 
1938     ///
1939     /// Configure core registers for a given CPU.
1940     ///
1941     #[cfg(target_arch = "aarch64")]
1942     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1943         #[allow(non_upper_case_globals)]
1944         // PSR (Processor State Register) bits.
1945         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1946         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1947         const PSR_F_BIT: u64 = 0x0000_0040;
1948         const PSR_I_BIT: u64 = 0x0000_0080;
1949         const PSR_A_BIT: u64 = 0x0000_0100;
1950         const PSR_D_BIT: u64 = 0x0000_0200;
1951         // Taken from arch/arm64/kvm/inject_fault.c.
1952         const PSTATE_FAULT_BITS_64: u64 =
1953             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
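        // That is 0x3c5: EL1h mode with the D, A, I and F exception bits masked.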
1954 
1955         let kreg_off = offset_of!(kvm_regs, regs);
1956 
1957         // Get the register index of the PSTATE (Processor State) register.
1958         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
1959         self.fd
1960             .lock()
1961             .unwrap()
1962             .set_one_reg(
1963                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1964                 &PSTATE_FAULT_BITS_64.to_le_bytes(),
1965             )
1966             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1967 
1968         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1969         if cpu_id == 0 {
1970             // Setting the PC (Program Counter) to the current program address (kernel address).
1971             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
1972             self.fd
1973                 .lock()
1974                 .unwrap()
1975                 .set_one_reg(
1976                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
1977                     &boot_ip.to_le_bytes(),
1978                 )
1979                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1980 
1981             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1982             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1983             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1984             // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
1985             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
1986             self.fd
1987                 .lock()
1988                 .unwrap()
1989                 .set_one_reg(
1990                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1991                     &fdt_start.to_le_bytes(),
1992                 )
1993                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1994         }
1995         Ok(())
1996     }
1997 
1998     #[cfg(target_arch = "x86_64")]
1999     ///
2000     /// Get the current CPU state
2001     ///
2002     /// Ordering requirements:
2003     ///
2004     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
2005     /// vCPU/LAPIC state. As such, it must be done before almost everything
2006     /// else, otherwise we cannot restore everything and expect it to work.
2007     ///
2008     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2009     /// still running.
2010     ///
2011     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
2012     ///
2013     /// GET_VCPU_EVENTS should probably be the last state to save, as it may
2014     /// be affected by the internal state modifications performed by the
2015     /// other GET ioctls.
2016     ///
2017     /// SREGS saves/restores a pending interrupt, similar to what
2018     /// VCPU_EVENTS also does.
2019     ///
2020     /// GET_MSRS requires a prepopulated data structure to do something
2021     /// meaningful; the data it returns can then be fed to SET_MSRS.
2022     ///
2023     /// # Example
2024     ///
2025     /// ```rust
2026     /// # use hypervisor::kvm::KvmHypervisor;
2027     /// # use std::sync::Arc;
2028     /// let kvm = KvmHypervisor::new().unwrap();
2029     /// let hv = Arc::new(kvm);
2030     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2031     /// vm.enable_split_irq().unwrap();
2032     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2033     /// let state = vcpu.state().unwrap();
2034     /// ```
2035     fn state(&self) -> cpu::Result<CpuState> {
2036         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
2037         let mp_state = self.get_mp_state()?.into();
2038         let regs = self.get_regs()?;
2039         let sregs = self.get_sregs()?;
2040         let xsave = self.get_xsave()?;
2041         let xcrs = self.get_xcrs()?;
2042         let lapic_state = self.get_lapic()?;
2043         let fpu = self.get_fpu()?;
2044 
2045         // Try to get all MSRs based on the list previously retrieved from KVM.
2046         // If the number of MSRs obtained from GET_MSRS is different from the
2047         // expected amount, we fall back to a slower method, getting MSRs
2048         // in chunks. This is the only way to make sure we try to get as many
2049         // MSRs as possible, even if some MSRs are not supported.
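        // For example (illustrative): if entry 10 out of 30 cannot be read,
        // entries 0..10 are kept, the faulty entry is skipped, and we retry
        // from entry 11 onwards.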
2050         let mut msr_entries = self.msrs.clone();
2051 
2052         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
2053         // emulated.
2054         if self.hyperv_synic.load(Ordering::Acquire) {
2055             let hyperv_synic_msrs = vec![
2056                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
2057                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
2058                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
2059                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
2060                 0x400000b5, 0x400000b6, 0x400000b7,
2061             ];
2062             for index in hyperv_synic_msrs {
2063                 let msr = kvm_msr_entry {
2064                     index,
2065                     ..Default::default()
2066                 };
2067                 msr_entries.push(msr.into());
2068             }
2069         }
2070 
2071         let expected_num_msrs = msr_entries.len();
2072         let num_msrs = self.get_msrs(&mut msr_entries)?;
2073         let msrs = if num_msrs != expected_num_msrs {
2074             let mut faulty_msr_index = num_msrs;
2075             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
2076 
2077             loop {
2078                 warn!(
2079                     "Detected faulty MSR 0x{:x} while getting MSRs",
2080                     msr_entries[faulty_msr_index].index
2081                 );
2082 
2083                 // Skip the first bad MSR
2084                 let start_pos = faulty_msr_index + 1;
2085 
2086                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
2087                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
2088 
2089                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
2090 
2091                 if num_msrs == sub_msr_entries.len() {
2092                     break;
2093                 }
2094 
2095                 faulty_msr_index = start_pos + num_msrs;
2096             }
2097 
2098             msr_entries_tmp
2099         } else {
2100             msr_entries
2101         };
2102 
2103         let vcpu_events = self.get_vcpu_events()?;
2104         let tsc_khz = self.tsc_khz()?;
2105 
2106         Ok(VcpuKvmState {
2107             cpuid,
2108             msrs,
2109             vcpu_events,
2110             regs: regs.into(),
2111             sregs: sregs.into(),
2112             fpu,
2113             lapic_state,
2114             xsave,
2115             xcrs,
2116             mp_state,
2117             tsc_khz,
2118         }
2119         .into())
2120     }
2121 
2122     ///
2123     /// Get the current AArch64 CPU state
2124     ///
2125     #[cfg(target_arch = "aarch64")]
2126     fn state(&self) -> cpu::Result<CpuState> {
2127         let mut state = VcpuKvmState {
2128             mp_state: self.get_mp_state()?.into(),
2129             ..Default::default()
2130         };
2131         // Get core registers
2132         state.core_regs = self.get_regs()?.into();
2133 
2134         // Get system registers.
2135         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2136         // For Armv8 there are around 500 registers.
2137         let mut sys_regs: Vec<Register> = Vec::new();
2138         let mut reg_list = RegList::new(500).unwrap();
2139         self.fd
2140             .lock()
2141             .unwrap()
2142             .get_reg_list(&mut reg_list)
2143             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2144 
2145         // At this point reg_list should contain: core registers and system
2146         // registers.
2147         // The register list contains the number of registers and their ids. We
2148         // will need to call KVM_GET_ONE_REG on each id in order to save
2149         // all of them. We carve out from the list the core registers, which are
2150         // represented in the kernel by the kvm_regs structure and for which we
2151         // can calculate the id based on the offset in the structure.
2152         reg_list.retain(|regid| is_system_register(*regid));
2153 
2154         // Now, for the rest of the registers left in the previously fetched
2155         // register list, we are simply calling KVM_GET_ONE_REG.
2156         let indices = reg_list.as_slice();
2157         for index in indices.iter() {
2158             let mut bytes = [0_u8; 8];
2159             self.fd
2160                 .lock()
2161                 .unwrap()
2162                 .get_one_reg(*index, &mut bytes)
2163                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2164             sys_regs.push(kvm_bindings::kvm_one_reg {
2165                 id: *index,
2166                 addr: u64::from_le_bytes(bytes),
2167             });
2168         }
2169 
2170         state.sys_regs = sys_regs;
2171 
2172         Ok(state.into())
2173     }
2174 
2175     #[cfg(target_arch = "x86_64")]
2176     ///
2177     /// Restore the previously saved CPU state
2178     ///
2179     /// Ordering requirements:
2180     ///
2181     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2182     /// still running.
2183     ///
2184     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
2185     /// if we ever change the BSP, we have to do that before restoring anything.
2186     /// The same seems to be true for CPUID stuff.
2187     ///
2188     /// SREGS saves/restores a pending interrupt, similar to what
2189     /// VCPU_EVENTS also does.
2190     ///
2191     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
2192     /// done before SET_VCPU_EVENTS, which restores it.
2193     ///
2194     /// SET_LAPIC must come after SET_SREGS, because the latter restores
2195     /// the apic base msr.
2196     ///
2197     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
2198     /// only restores successfully when the LAPIC is correctly configured.
2199     ///
2200     /// Arguments: `state` - the previously saved `CpuState` to restore.
2201     /// # Example
2202     ///
2203     /// ```rust
2204     /// # use hypervisor::kvm::KvmHypervisor;
2205     /// # use std::sync::Arc;
2206     /// let kvm = KvmHypervisor::new().unwrap();
2207     /// let hv = Arc::new(kvm);
2208     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2209     /// vm.enable_split_irq().unwrap();
2210     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2211     /// let state = vcpu.state().unwrap();
2212     /// vcpu.set_state(&state).unwrap();
2213     /// ```
2214     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2215         let state: VcpuKvmState = state.clone().into();
2216         self.set_cpuid2(&state.cpuid)?;
2217         self.set_mp_state(state.mp_state.into())?;
2218         self.set_regs(&state.regs.into())?;
2219         self.set_sregs(&state.sregs.into())?;
2220         self.set_xsave(&state.xsave)?;
2221         self.set_xcrs(&state.xcrs)?;
2222         self.set_lapic(&state.lapic_state)?;
2223         self.set_fpu(&state.fpu)?;
2224 
2225         if let Some(freq) = state.tsc_khz {
2226             self.set_tsc_khz(freq)?;
2227         }
2228 
2229         // Try to set all MSRs previously stored.
2230         // If the number of MSRs set from SET_MSRS is different from the
2231         // expected amount, we fall back to a slower method, setting MSRs
2232         // in chunks. This is the only way to make sure we try to set as many
2233         // MSRs as possible, even if some MSRs are not supported.
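        // For example (illustrative): if entry 10 out of 30 cannot be set,
        // entries 0..10 have already been applied; the faulty entry is
        // skipped and we retry from entry 11 onwards.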
2234         let expected_num_msrs = state.msrs.len();
2235         let num_msrs = self.set_msrs(&state.msrs)?;
2236         if num_msrs != expected_num_msrs {
2237             let mut faulty_msr_index = num_msrs;
2238 
2239             loop {
2240                 warn!(
2241                     "Detected faulty MSR 0x{:x} while setting MSRs",
2242                     state.msrs[faulty_msr_index].index
2243                 );
2244 
2245                 // Skip the first bad MSR
2246                 let start_pos = faulty_msr_index + 1;
2247 
2248                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2249 
2250                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2251 
2252                 if num_msrs == sub_msr_entries.len() {
2253                     break;
2254                 }
2255 
2256                 faulty_msr_index = start_pos + num_msrs;
2257             }
2258         }
2259 
2260         self.set_vcpu_events(&state.vcpu_events)?;
2261 
2262         Ok(())
2263     }
2264 
2265     ///
2266     /// Restore the previously saved AArch64 CPU state
2267     ///
2268     #[cfg(target_arch = "aarch64")]
2269     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2270         let state: VcpuKvmState = state.clone().into();
2271         // Set core registers
2272         self.set_regs(&state.core_regs.into())?;
2273         // Set system registers
2274         for reg in &state.sys_regs {
2275             self.fd
2276                 .lock()
2277                 .unwrap()
2278                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2279                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2280         }
2281 
2282         self.set_mp_state(state.mp_state.into())?;
2283 
2284         Ok(())
2285     }
2286 
2287     ///
2288     /// Initialize TDX for this CPU
2289     ///
2290     #[cfg(feature = "tdx")]
2291     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2292         tdx_command(
2293             &self.fd.lock().unwrap().as_raw_fd(),
2294             TdxCommand::InitVcpu,
2295             0,
2296             hob_address,
2297         )
2298         .map_err(cpu::HypervisorCpuError::InitializeTdx)
2299     }
2300 
2301     ///
2302     /// Set the "immediate_exit" state
2303     ///
2304     fn set_immediate_exit(&self, exit: bool) {
2305         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
2306     }
2307 
2308     ///
2309     /// Returns the details about the TDX exit reason.
2310     ///
2311     #[cfg(feature = "tdx")]
2312     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2313         let mut fd = self.fd.as_ref().lock().unwrap();
2314         let kvm_run = fd.get_kvm_run();
2315         // SAFETY: accessing a union field in a valid structure
2316         let tdx_vmcall = unsafe {
2317             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2318                 as *mut KvmTdxExit))
2319                 .u
2320                 .vmcall
2321         };
2322 
2323         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2324 
2325         if tdx_vmcall.type_ != 0 {
2326             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2327         }
2328 
2329         match tdx_vmcall.subfunction {
2330             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2331             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2332                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2333             }
2334             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2335         }
2336     }
2337 
2338     ///
2339     /// Set the status code for TDX exit
2340     ///
2341     #[cfg(feature = "tdx")]
2342     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2343         let mut fd = self.fd.as_ref().lock().unwrap();
2344         let kvm_run = fd.get_kvm_run();
2345         // SAFETY: accessing a union field in a valid structure
2346         let tdx_vmcall = unsafe {
2347             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2348                 as *mut KvmTdxExit))
2349                 .u
2350                 .vmcall
2351         };
2352 
2353         tdx_vmcall.status_code = match status {
2354             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2355             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2356         };
2357     }
2358 
2359     #[cfg(target_arch = "x86_64")]
2360     ///
2361     /// Return the list of initial MSR entries for a VCPU
2362     ///
2363     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2364         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2365 
2366         [
2367             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2368             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2369             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2370             msr!(msr_index::MSR_STAR),
2371             msr!(msr_index::MSR_CSTAR),
2372             msr!(msr_index::MSR_LSTAR),
2373             msr!(msr_index::MSR_KERNEL_GS_BASE),
2374             msr!(msr_index::MSR_SYSCALL_MASK),
2375             msr!(msr_index::MSR_IA32_TSC),
2376             msr_data!(
2377                 msr_index::MSR_IA32_MISC_ENABLE,
2378                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2379             ),
2380             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2381         ]
2382         .to_vec()
2383     }
2384 
2385     #[cfg(target_arch = "aarch64")]
2386     fn has_pmu_support(&self) -> bool {
2387         let cpu_attr = kvm_bindings::kvm_device_attr {
2388             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2389             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2390             addr: 0x0,
2391             flags: 0,
2392         };
2393         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
2394     }
2395 
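    // Illustrative usage sketch: probe with `has_pmu_support()` first, then
    // call `init_pmu()` with the PMU interrupt (a PPI) before the vCPU is
    // first run.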
2396     #[cfg(target_arch = "aarch64")]
2397     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2398         let cpu_attr = kvm_bindings::kvm_device_attr {
2399             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2400             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2401             addr: 0x0,
2402             flags: 0,
2403         };
2404         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2405             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2406             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2407             addr: &irq as *const u32 as u64,
2408             flags: 0,
2409         };
2410         self.fd
2411             .lock()
2412             .unwrap()
2413             .set_device_attr(&cpu_attr_irq)
2414             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2415         self.fd
2416             .lock()
2417             .unwrap()
2418             .set_device_attr(&cpu_attr)
2419             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2420     }
2421 
2422     #[cfg(target_arch = "x86_64")]
2423     ///
2424     /// Get the frequency of the TSC if available
2425     ///
2426     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2427         match self.fd.lock().unwrap().get_tsc_khz() {
2428             Err(e) => {
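                // An EIO from KVM presumably means the TSC frequency is not
                // available on this host, so report None instead of failing.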
2429                 if e.errno() == libc::EIO {
2430                     Ok(None)
2431                 } else {
2432                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2433                 }
2434             }
2435             Ok(v) => Ok(Some(v)),
2436         }
2437     }
2438 
2439     #[cfg(target_arch = "x86_64")]
2440     ///
2441     /// Set the frequency of the TSC if available
2442     ///
2443     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2444         match self.fd.lock().unwrap().set_tsc_khz(freq) {
2445             Err(e) => {
2446                 if e.errno() == libc::EIO {
2447                     Ok(())
2448                 } else {
2449                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2450                 }
2451             }
2452             Ok(_) => Ok(()),
2453         }
2454     }
2455 
2456     #[cfg(target_arch = "x86_64")]
2457     ///
2458     /// Trigger NMI interrupt
2459     ///
2460     fn nmi(&self) -> cpu::Result<()> {
2461         match self.fd.lock().unwrap().nmi() {
2462             Err(e) => {
2463                 if e.errno() == libc::EIO {
2464                     Ok(())
2465                 } else {
2466                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
2467                 }
2468             }
2469             Ok(_) => Ok(()),
2470         }
2471     }
2472 }
2473 
2474 impl KvmVcpu {
2475     #[cfg(target_arch = "x86_64")]
2476     ///
2477     /// X86 specific call that returns the vcpu's current "xsave struct".
2478     ///
2479     fn get_xsave(&self) -> cpu::Result<XsaveState> {
2480         Ok(self
2481             .fd
2482             .lock()
2483             .unwrap()
2484             .get_xsave()
2485             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
2486             .into())
2487     }
2488 
2489     #[cfg(target_arch = "x86_64")]
2490     ///
2491     /// X86 specific call that sets the vcpu's current "xsave struct".
2492     ///
2493     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
2494         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
2495         self.fd
2496             .lock()
2497             .unwrap()
2498             .set_xsave(&xsave)
2499             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2500     }
2501 
2502     #[cfg(target_arch = "x86_64")]
2503     ///
2504     /// X86 specific call that returns the vcpu's current "xcrs".
2505     ///
2506     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2507         self.fd
2508             .lock()
2509             .unwrap()
2510             .get_xcrs()
2511             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2512     }
2513 
2514     #[cfg(target_arch = "x86_64")]
2515     ///
2516     /// X86 specific call that sets the vcpu's current "xcrs".
2517     ///
2518     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2519         self.fd
2520             .lock()
2521             .unwrap()
2522             .set_xcrs(xcrs)
2523             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2524     }
2525 
2526     #[cfg(target_arch = "x86_64")]
2527     ///
2528     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2529     /// states of the vcpu.
2530     ///
2531     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2532         self.fd
2533             .lock()
2534             .unwrap()
2535             .get_vcpu_events()
2536             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2537     }
2538 
2539     #[cfg(target_arch = "x86_64")]
2540     ///
2541     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2542     /// of the vcpu.
2543     ///
2544     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2545         self.fd
2546             .lock()
2547             .unwrap()
2548             .set_vcpu_events(events)
2549             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2550     }
2551 }
2552