xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 19d36c765fdf00be749d95b3e61028bc302d6d73)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 use std::any::Any;
12 use std::collections::HashMap;
13 #[cfg(target_arch = "x86_64")]
14 use std::fs::File;
15 #[cfg(target_arch = "x86_64")]
16 use std::os::unix::io::AsRawFd;
17 #[cfg(feature = "tdx")]
18 use std::os::unix::io::RawFd;
19 use std::result;
20 #[cfg(target_arch = "x86_64")]
21 use std::sync::atomic::{AtomicBool, Ordering};
22 use std::sync::{Arc, Mutex, RwLock};
23 
24 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
25 use vmm_sys_util::eventfd::EventFd;
26 
27 #[cfg(target_arch = "aarch64")]
28 use crate::aarch64::gic::KvmGicV3Its;
29 #[cfg(target_arch = "aarch64")]
30 pub use crate::aarch64::{
31     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
32     VcpuKvmState,
33 };
34 #[cfg(target_arch = "aarch64")]
35 use crate::arch::aarch64::gic::{Vgic, VgicConfig};
36 use crate::vm::{self, InterruptSourceConfig, VmOps};
37 #[cfg(target_arch = "aarch64")]
38 use crate::{arm64_core_reg_id, offset_of};
39 use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType};
40 // x86_64 dependencies
41 #[cfg(target_arch = "x86_64")]
42 pub mod x86_64;
43 #[cfg(target_arch = "aarch64")]
44 use aarch64::{RegList, Register};
45 #[cfg(target_arch = "x86_64")]
46 use kvm_bindings::{
47     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
48     KVM_GUESTDBG_USE_HW_BP,
49 };
50 #[cfg(target_arch = "x86_64")]
51 use x86_64::check_required_kvm_extensions;
52 #[cfg(target_arch = "x86_64")]
53 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};
54 
55 #[cfg(target_arch = "x86_64")]
56 use crate::arch::x86::{
57     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
58 };
59 #[cfg(target_arch = "x86_64")]
60 use crate::ClockData;
61 use crate::{
62     CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion,
63     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
64 };
65 // aarch64 dependencies
66 #[cfg(target_arch = "aarch64")]
67 pub mod aarch64;
68 #[cfg(target_arch = "aarch64")]
69 use std::mem;
70 
71 pub use kvm_bindings::{
72     kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
73     kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
74     KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
75     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
76 };
77 #[cfg(target_arch = "aarch64")]
78 use kvm_bindings::{
79     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
80     KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
81     KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
82     KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
83 };
84 #[cfg(feature = "tdx")]
85 use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
86 pub use kvm_ioctls::{Cap, Kvm};
87 use thiserror::Error;
88 use vfio_ioctls::VfioDeviceFd;
89 #[cfg(feature = "tdx")]
90 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
91 pub use {kvm_bindings, kvm_ioctls};
92 ///
93 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
94 ///
95 pub use {
96     kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
97     kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
98 };
99 
100 #[cfg(target_arch = "x86_64")]
101 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
102 
103 #[cfg(target_arch = "x86_64")]
104 use vmm_sys_util::ioctl_io_nr;
105 #[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
106 use vmm_sys_util::ioctl_ioc_nr;
107 
108 #[cfg(target_arch = "x86_64")]
109 ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);
110 
111 #[cfg(feature = "tdx")]
112 const KVM_EXIT_TDX: u32 = 50;
113 #[cfg(feature = "tdx")]
114 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
115 #[cfg(feature = "tdx")]
116 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
117 #[cfg(feature = "tdx")]
118 const TDG_VP_VMCALL_SUCCESS: u64 = 0;
119 #[cfg(feature = "tdx")]
120 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;
121 
122 #[cfg(feature = "tdx")]
123 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
124 
125 #[cfg(feature = "tdx")]
126 #[repr(u32)]
127 enum TdxCommand {
128     Capabilities = 0,
129     InitVm,
130     InitVcpu,
131     InitMemRegion,
132     Finalize,
133 }
134 
135 #[cfg(feature = "tdx")]
136 pub enum TdxExitDetails {
137     GetQuote,
138     SetupEventNotifyInterrupt,
139 }
140 
141 #[cfg(feature = "tdx")]
142 pub enum TdxExitStatus {
143     Success,
144     InvalidOperand,
145 }
146 
147 #[cfg(feature = "tdx")]
148 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
149 
150 #[cfg(feature = "tdx")]
151 #[repr(C)]
152 #[derive(Debug, Default)]
153 pub struct TdxCpuidConfig {
154     pub leaf: u32,
155     pub sub_leaf: u32,
156     pub eax: u32,
157     pub ebx: u32,
158     pub ecx: u32,
159     pub edx: u32,
160 }
161 
162 #[cfg(feature = "tdx")]
163 #[repr(C)]
164 #[derive(Debug, Default)]
165 pub struct TdxCapabilities {
166     pub attrs_fixed0: u64,
167     pub attrs_fixed1: u64,
168     pub xfam_fixed0: u64,
169     pub xfam_fixed1: u64,
170     pub nr_cpuid_configs: u32,
171     pub padding: u32,
172     pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
173 }
174 
175 #[cfg(feature = "tdx")]
#[repr(C)]
176 #[derive(Copy, Clone)]
177 pub struct KvmTdxExit {
178     pub type_: u32,
179     pub pad: u32,
180     pub u: KvmTdxExitU,
181 }
182 
183 #[cfg(feature = "tdx")]
184 #[repr(C)]
185 #[derive(Copy, Clone)]
186 pub union KvmTdxExitU {
187     pub vmcall: KvmTdxExitVmcall,
188 }
189 
190 #[cfg(feature = "tdx")]
191 #[repr(C)]
192 #[derive(Debug, Default, Copy, Clone, PartialEq)]
193 pub struct KvmTdxExitVmcall {
194     pub type_: u64,
195     pub subfunction: u64,
196     pub reg_mask: u64,
197     pub in_r12: u64,
198     pub in_r13: u64,
199     pub in_r14: u64,
200     pub in_r15: u64,
201     pub in_rbx: u64,
202     pub in_rdi: u64,
203     pub in_rsi: u64,
204     pub in_r8: u64,
205     pub in_r9: u64,
206     pub in_rdx: u64,
207     pub status_code: u64,
208     pub out_r11: u64,
209     pub out_r12: u64,
210     pub out_r13: u64,
211     pub out_r14: u64,
212     pub out_r15: u64,
213     pub out_rbx: u64,
214     pub out_rdi: u64,
215     pub out_rsi: u64,
216     pub out_r8: u64,
217     pub out_r9: u64,
218     pub out_rdx: u64,
219 }
220 
221 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
222     fn from(region: kvm_userspace_memory_region) -> Self {
223         let mut flags = USER_MEMORY_REGION_READ;
224         if region.flags & KVM_MEM_READONLY == 0 {
225             flags |= USER_MEMORY_REGION_WRITE;
226         }
227         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
228             flags |= USER_MEMORY_REGION_LOG_DIRTY;
229         }
230 
231         UserMemoryRegion {
232             slot: region.slot,
233             guest_phys_addr: region.guest_phys_addr,
234             memory_size: region.memory_size,
235             userspace_addr: region.userspace_addr,
236             flags,
237         }
238     }
239 }
240 
241 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
242     fn from(region: UserMemoryRegion) -> Self {
243         assert!(
244             region.flags & USER_MEMORY_REGION_READ != 0,
245             "KVM mapped memory is always readable"
246         );
247 
248         let mut flags = 0;
249         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
250             flags |= KVM_MEM_READONLY;
251         }
252         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
253             flags |= KVM_MEM_LOG_DIRTY_PAGES;
254         }
255 
256         kvm_userspace_memory_region {
257             slot: region.slot,
258             guest_phys_addr: region.guest_phys_addr,
259             memory_size: region.memory_size,
260             userspace_addr: region.userspace_addr,
261             flags,
262         }
263     }
264 }
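
#[cfg(test)]
mod memory_region_flag_tests {
    // Illustrative sketch (not part of the original test suite): the two
    // `From` impls above translate between the hypervisor-agnostic region
    // flags and the KVM flags, and the mapping round-trips.
    use super::*;

    #[test]
    fn user_memory_region_flags_round_trip() {
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x1000,
            userspace_addr: 0,
            flags: KVM_MEM_LOG_DIRTY_PAGES,
        };

        // KVM memory is always readable; no KVM_MEM_READONLY means writable.
        let region: UserMemoryRegion = kvm_region.into();
        assert_ne!(region.flags & USER_MEMORY_REGION_READ, 0);
        assert_ne!(region.flags & USER_MEMORY_REGION_WRITE, 0);
        assert_ne!(region.flags & USER_MEMORY_REGION_LOG_DIRTY, 0);

        // Converting back restores the original KVM flags.
        let back: kvm_userspace_memory_region = region.into();
        assert_eq!(back.flags, KVM_MEM_LOG_DIRTY_PAGES);
    }
}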
265 
266 impl From<kvm_mp_state> for MpState {
267     fn from(s: kvm_mp_state) -> Self {
268         MpState::Kvm(s)
269     }
270 }
271 
272 impl From<MpState> for kvm_mp_state {
273     fn from(ms: MpState) -> Self {
274         match ms {
275             MpState::Kvm(s) => s,
276             /* Needed in case other hypervisors are enabled */
277             #[allow(unreachable_patterns)]
278             _ => panic!("MpState is not valid"),
279         }
280     }
281 }
282 
283 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
284     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
285         match a {
286             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
287             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
288         }
289     }
290 }
291 
292 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
293     fn from(a: IoEventAddress) -> Self {
294         match a {
295             IoEventAddress::Pio(x) => Self::Pio(x),
296             IoEventAddress::Mmio(x) => Self::Mmio(x),
297         }
298     }
299 }
300 
301 impl From<VcpuKvmState> for CpuState {
302     fn from(s: VcpuKvmState) -> Self {
303         CpuState::Kvm(s)
304     }
305 }
306 
307 impl From<CpuState> for VcpuKvmState {
308     fn from(s: CpuState) -> Self {
309         match s {
310             CpuState::Kvm(s) => s,
311             /* Needed in case other hypervisors are enabled */
312             #[allow(unreachable_patterns)]
313             _ => panic!("CpuState is not valid"),
314         }
315     }
316 }
317 
318 #[cfg(target_arch = "x86_64")]
319 impl From<kvm_clock_data> for ClockData {
320     fn from(d: kvm_clock_data) -> Self {
321         ClockData::Kvm(d)
322     }
323 }
324 
325 #[cfg(target_arch = "x86_64")]
326 impl From<ClockData> for kvm_clock_data {
327     fn from(ms: ClockData) -> Self {
328         match ms {
329             ClockData::Kvm(s) => s,
330             /* Needed in case other hypervisors are enabled */
331             #[allow(unreachable_patterns)]
332             _ => panic!("ClockData is not valid"),
333         }
334     }
335 }
336 
337 impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
338     fn from(s: kvm_bindings::kvm_regs) -> Self {
339         crate::StandardRegisters::Kvm(s)
340     }
341 }
342 
343 impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
344     fn from(e: crate::StandardRegisters) -> Self {
345         match e {
346             crate::StandardRegisters::Kvm(e) => e,
347             /* Needed in case other hypervisors are enabled */
348             #[allow(unreachable_patterns)]
349             _ => panic!("StandardRegisters are not valid"),
350         }
351     }
352 }
353 
354 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
355     fn from(s: kvm_irq_routing_entry) -> Self {
356         IrqRoutingEntry::Kvm(s)
357     }
358 }
359 
360 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
361     fn from(e: IrqRoutingEntry) -> Self {
362         match e {
363             IrqRoutingEntry::Kvm(e) => e,
364             /* Needed in case other hypervisors are enabled */
365             #[allow(unreachable_patterns)]
366             _ => panic!("IrqRoutingEntry is not valid"),
367         }
368     }
369 }
370 
371 struct KvmDirtyLogSlot {
372     slot: u32,
373     guest_phys_addr: u64,
374     memory_size: u64,
375     userspace_addr: u64,
376 }
377 
378 /// Wrapper over KVM VM ioctls.
379 pub struct KvmVm {
380     fd: Arc<VmFd>,
381     #[cfg(target_arch = "x86_64")]
382     msrs: Vec<MsrEntry>,
383     dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
384 }
385 
386 impl KvmVm {
387     ///
388     /// Creates an emulated device in the kernel.
389     ///
390     /// See the documentation for `KVM_CREATE_DEVICE`.
391     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
392         let device_fd = self
393             .fd
394             .create_device(device)
395             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
396         Ok(VfioDeviceFd::new_from_kvm(device_fd))
397     }
398     /// Checks if a particular `Cap` is available.
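    ///
    /// # Example
    ///
    /// An illustrative sketch (assumes a KVM-capable host); `as_any` is used
    /// to downcast the `Vm` trait object back to a `KvmVm`:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::{Cap, KvmHypervisor, KvmVm};
    /// # use hypervisor::vm::Vm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let kvm_vm = vm.as_any().downcast_ref::<KvmVm>().unwrap();
    /// if kvm_vm.check_extension(Cap::Irqfd) {
    ///     println!("KVM_CAP_IRQFD is available");
    /// }
    /// ```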
399     pub fn check_extension(&self, c: Cap) -> bool {
400         self.fd.check_extension(c)
401     }
402 }
403 
404 /// Implementation of Vm trait for KVM
405 ///
406 /// # Examples
407 ///
408 /// ```
409 /// # use hypervisor::kvm::KvmHypervisor;
410 /// # use std::sync::Arc;
411 /// let kvm = KvmHypervisor::new().unwrap();
412 /// let hypervisor = Arc::new(kvm);
413 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
414 /// ```
415 impl vm::Vm for KvmVm {
416     #[cfg(target_arch = "x86_64")]
417     ///
418     /// Sets the address of the one-page region in the VM's address space.
419     ///
420     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
421         self.fd
422             .set_identity_map_address(address)
423             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
424     }
425 
426     #[cfg(target_arch = "x86_64")]
427     ///
428     /// Sets the address of the three-page region in the VM's address space.
429     ///
430     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
431         self.fd
432             .set_tss_address(offset)
433             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
434     }
435 
436     ///
437     /// Creates an in-kernel interrupt controller.
438     ///
439     fn create_irq_chip(&self) -> vm::Result<()> {
440         self.fd
441             .create_irq_chip()
442             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
443     }
444 
445     ///
446     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
447     ///
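    /// # Example
    ///
    /// A minimal sketch (assumes a KVM-capable host and an in-kernel
    /// interrupt controller):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::Vm;
    /// # use vmm_sys_util::eventfd::EventFd;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// vm.create_irq_chip().unwrap();
    /// let evtfd = EventFd::new(0).unwrap();
    /// // From now on, signaling `evtfd` injects GSI 4 into the guest.
    /// vm.register_irqfd(&evtfd, 4).unwrap();
    /// ```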
448     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
449         self.fd
450             .register_irqfd(fd, gsi)
451             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
452     }
453 
454     ///
455     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
456     ///
457     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
458         self.fd
459             .unregister_irqfd(fd, gsi)
460             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
461     }
462 
463     ///
464     /// Creates a vCPU for the given id, wrapped in a `Vcpu` trait object.
465     ///
466     fn create_vcpu(
467         &self,
468         id: u8,
469         vm_ops: Option<Arc<dyn VmOps>>,
470     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
471         let fd = self
472             .fd
473             .create_vcpu(id as u64)
474             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
475         let vcpu = KvmVcpu {
476             fd: Arc::new(Mutex::new(fd)),
477             #[cfg(target_arch = "x86_64")]
478             msrs: self.msrs.clone(),
479             vm_ops,
480             #[cfg(target_arch = "x86_64")]
481             hyperv_synic: AtomicBool::new(false),
482         };
483         Ok(Arc::new(vcpu))
484     }
485 
486     #[cfg(target_arch = "aarch64")]
487     ///
488     /// Creates a virtual GIC device.
489     ///
490     fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
491         let gic_device = KvmGicV3Its::new(self, config)
492             .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
493         Ok(Arc::new(Mutex::new(gic_device)))
494     }
495 
496     ///
497     /// Registers an event to be signaled whenever a certain address is written to.
498     ///
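    /// # Example
    ///
    /// A sketch (address and datamatch values are arbitrary): the eventfd
    /// fires only when the guest writes the 32-bit value 0x42 to MMIO
    /// address 0x1000.
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::{DataMatch, Vm};
    /// # use hypervisor::IoEventAddress;
    /// # use vmm_sys_util::eventfd::EventFd;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let evtfd = EventFd::new(0).unwrap();
    /// let addr = IoEventAddress::Mmio(0x1000);
    /// vm.register_ioevent(&evtfd, &addr, Some(DataMatch::DataMatch32(0x42)))
    ///     .unwrap();
    /// ```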
499     fn register_ioevent(
500         &self,
501         fd: &EventFd,
502         addr: &IoEventAddress,
503         datamatch: Option<vm::DataMatch>,
504     ) -> vm::Result<()> {
505         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
506         if let Some(dm) = datamatch {
507             match dm {
508                 vm::DataMatch::DataMatch32(kvm_dm32) => self
509                     .fd
510                     .register_ioevent(fd, addr, kvm_dm32)
511                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
512                 vm::DataMatch::DataMatch64(kvm_dm64) => self
513                     .fd
514                     .register_ioevent(fd, addr, kvm_dm64)
515                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
516             }
517         } else {
518             self.fd
519                 .register_ioevent(fd, addr, NoDatamatch)
520                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
521         }
522     }
523 
524     ///
525     /// Unregisters an event from a certain address it has been previously registered to.
526     ///
527     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
528         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
529         self.fd
530             .unregister_ioevent(fd, addr, NoDatamatch)
531             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
532     }
533 
534     ///
535     /// Constructs a routing entry
536     ///
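    /// # Example
    ///
    /// A sketch routing legacy pin 4 of the first irqchip to GSI 4 and
    /// applying it with `set_gsi_routing` (assumes an in-kernel interrupt
    /// controller):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::{InterruptSourceConfig, LegacyIrqSourceConfig, Vm};
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// vm.create_irq_chip().unwrap();
    /// let config = InterruptSourceConfig::LegacyIrq(LegacyIrqSourceConfig {
    ///     irqchip: 0,
    ///     pin: 4,
    /// });
    /// let entry = vm.make_routing_entry(4, &config);
    /// vm.set_gsi_routing(&[entry]).unwrap();
    /// ```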
537     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
538         match &config {
539             InterruptSourceConfig::MsiIrq(cfg) => {
540                 let mut kvm_route = kvm_irq_routing_entry {
541                     gsi,
542                     type_: KVM_IRQ_ROUTING_MSI,
543                     ..Default::default()
544                 };
545 
546                 kvm_route.u.msi.address_lo = cfg.low_addr;
547                 kvm_route.u.msi.address_hi = cfg.high_addr;
548                 kvm_route.u.msi.data = cfg.data;
549 
550                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
551                     // On AArch64, the range of the 'devid' is limited: it
552                     // must fit in 16 bits (at most 65535, the max of u16).
553                     //
554                     // The BDF cannot be used directly, because the 'segment'
555                     // occupies the high 16 bits. The layout of the u32 BDF is:
556                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
557                     // |      segment    |     bus    |   device   |  function  |
558                     //
559                     // Since we support only one bus per segment, we can build a
560                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
561                     // the 'segment' data.
562                     // This resolves the range-checking problem and gives every
563                     // device a distinct `devid`. The limitation is that at most
564                     // 256 segments can be supported.
565                     //
566                     let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
567 
568                     kvm_route.flags = KVM_MSI_VALID_DEVID;
569                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
570                 }
571                 kvm_route.into()
572             }
573             InterruptSourceConfig::LegacyIrq(cfg) => {
574                 let mut kvm_route = kvm_irq_routing_entry {
575                     gsi,
576                     type_: KVM_IRQ_ROUTING_IRQCHIP,
577                     ..Default::default()
578                 };
579                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
580                 kvm_route.u.irqchip.pin = cfg.pin;
581 
582                 kvm_route.into()
583             }
584         }
585     }
586 
587     ///
588     /// Sets the GSI routing table entries, overwriting any previously set
589     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
590     ///
591     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
592         let mut irq_routing =
593             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
594         irq_routing[0].nr = entries.len() as u32;
595         irq_routing[0].flags = 0;
596         let entries: Vec<kvm_irq_routing_entry> = entries
597             .iter()
598             .map(|entry| match entry {
599                 IrqRoutingEntry::Kvm(e) => *e,
600                 #[allow(unreachable_patterns)]
601                 _ => panic!("IrqRoutingEntry type is wrong"),
602             })
603             .collect();
604 
605         // SAFETY: irq_routing was initialized with space for entries.len() entries, and
606         // entries_slice is created with that same length, so it is guaranteed to be large
607         // enough to hold everything from entries.
608         unsafe {
609             let entries_slice: &mut [kvm_irq_routing_entry] =
610                 irq_routing[0].entries.as_mut_slice(entries.len());
611             entries_slice.copy_from_slice(&entries);
612         }
613 
614         self.fd
615             .set_gsi_routing(&irq_routing[0])
616             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
617     }
618 
619     ///
620     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
621     ///
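    /// # Example
    ///
    /// A sketch with placeholder addresses; in real use `userspace_addr`
    /// must point at a valid host mapping of at least `memory_size` bytes.
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::Vm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let host_addr: u64 = 0x7f00_0000_0000; // placeholder host address
    /// // Slot 0: 64 KiB at GPA 0x1000_0000, writable, with dirty-page logging.
    /// let region = vm.make_user_memory_region(0, 0x1000_0000, 0x1_0000, host_addr, false, true);
    /// vm.create_user_memory_region(region).unwrap();
    /// ```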
622     fn make_user_memory_region(
623         &self,
624         slot: u32,
625         guest_phys_addr: u64,
626         memory_size: u64,
627         userspace_addr: u64,
628         readonly: bool,
629         log_dirty_pages: bool,
630     ) -> UserMemoryRegion {
631         kvm_userspace_memory_region {
632             slot,
633             guest_phys_addr,
634             memory_size,
635             userspace_addr,
636             flags: if readonly { KVM_MEM_READONLY } else { 0 }
637                 | if log_dirty_pages {
638                     KVM_MEM_LOG_DIRTY_PAGES
639                 } else {
640                     0
641                 },
642         }
643         .into()
644     }
645 
646     ///
647     /// Creates a guest physical memory region.
648     ///
649     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
650         let mut region: kvm_userspace_memory_region = user_memory_region.into();
651 
652         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
653             if (region.flags & KVM_MEM_READONLY) != 0 {
654                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
655                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
656                 )));
657             }
658 
659             // Keep track of the regions that need dirty pages log
660             self.dirty_log_slots.write().unwrap().insert(
661                 region.slot,
662                 KvmDirtyLogSlot {
663                     slot: region.slot,
664                     guest_phys_addr: region.guest_phys_addr,
665                     memory_size: region.memory_size,
666                     userspace_addr: region.userspace_addr,
667                 },
668             );
669 
670             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
671             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
672             region.flags = 0;
673         }
674 
675         // SAFETY: Safe because guest regions are guaranteed not to overlap.
676         unsafe {
677             self.fd
678                 .set_user_memory_region(region)
679                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
680         }
681     }
682 
683     ///
684     /// Removes a guest physical memory region.
685     ///
686     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
687         let mut region: kvm_userspace_memory_region = user_memory_region.into();
688 
689         // Remove the corresponding entry from "self.dirty_log_slots" if needed
690         self.dirty_log_slots.write().unwrap().remove(&region.slot);
691 
692         // Setting the size to 0 means "remove"
693         region.memory_size = 0;
694         // SAFETY: Safe because guest regions are guaranteed not to overlap.
695         unsafe {
696             self.fd
697                 .set_user_memory_region(region)
698                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
699         }
700     }
701 
702     ///
703     /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
704     ///
705     #[cfg(target_arch = "aarch64")]
706     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
707         self.fd
708             .get_preferred_target(kvi)
709             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
710     }
711 
712     #[cfg(target_arch = "x86_64")]
713     fn enable_split_irq(&self) -> vm::Result<()> {
714         // Create a split irqchip.
715         // Only the local APIC is emulated in the kernel; the PICs and the
716         // IOAPIC are not.
717         let mut cap = kvm_enable_cap {
718             cap: KVM_CAP_SPLIT_IRQCHIP,
719             ..Default::default()
720         };
721         cap.args[0] = NUM_IOAPIC_PINS as u64;
722         self.fd
723             .enable_cap(&cap)
724             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
725         Ok(())
726     }
727 
728     #[cfg(target_arch = "x86_64")]
729     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
730         let mut cap = kvm_enable_cap {
731             cap: KVM_CAP_SGX_ATTRIBUTE,
732             ..Default::default()
733         };
734         cap.args[0] = file.as_raw_fd() as u64;
735         self.fd
736             .enable_cap(&cap)
737             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
738         Ok(())
739     }
740 
741     /// Retrieve guest clock.
742     #[cfg(target_arch = "x86_64")]
743     fn get_clock(&self) -> vm::Result<ClockData> {
744         Ok(self
745             .fd
746             .get_clock()
747             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
748             .into())
749     }
750 
751     /// Set guest clock.
752     #[cfg(target_arch = "x86_64")]
753     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
754         let data = (*data).into();
755         self.fd
756             .set_clock(&data)
757             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
758     }
759 
760     /// Create a device that is used for passthrough
761     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
762         let mut vfio_dev = kvm_create_device {
763             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
764             fd: 0,
765             flags: 0,
766         };
767 
768         self.create_device(&mut vfio_dev)
769             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
770     }
771 
772     ///
773     /// Start logging dirty pages
774     ///
775     fn start_dirty_log(&self) -> vm::Result<()> {
776         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
777         for (_, s) in dirty_log_slots.iter() {
778             let region = kvm_userspace_memory_region {
779                 slot: s.slot,
780                 guest_phys_addr: s.guest_phys_addr,
781                 memory_size: s.memory_size,
782                 userspace_addr: s.userspace_addr,
783                 flags: KVM_MEM_LOG_DIRTY_PAGES,
784             };
785             // SAFETY: Safe because guest regions are guaranteed not to overlap.
786             unsafe {
787                 self.fd
788                     .set_user_memory_region(region)
789                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
790             }
791         }
792 
793         Ok(())
794     }
795 
796     ///
797     /// Stop logging dirty pages
798     ///
799     fn stop_dirty_log(&self) -> vm::Result<()> {
800         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
801         for (_, s) in dirty_log_slots.iter() {
802             let region = kvm_userspace_memory_region {
803                 slot: s.slot,
804                 guest_phys_addr: s.guest_phys_addr,
805                 memory_size: s.memory_size,
806                 userspace_addr: s.userspace_addr,
807                 flags: 0,
808             };
809             // SAFETY: Safe because guest regions are guaranteed not to overlap.
810             unsafe {
811                 self.fd
812                     .set_user_memory_region(region)
813                     .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
814             }
815         }
816 
817         Ok(())
818     }
819 
820     ///
821     /// Get dirty pages bitmap (one bit per page)
822     ///
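    /// # Example
    ///
    /// A sketch of the dirty-log cycle; the slot is assumed to have been
    /// created with `log_dirty_pages` set, and the numbers are placeholders.
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::Vm;
    /// # let hypervisor = KvmHypervisor::new().unwrap();
    /// # let vm = hypervisor.create_vm().unwrap();
    /// vm.start_dirty_log().unwrap();
    /// // ... run vCPUs and let the guest dirty some pages ...
    /// let bitmap = vm.get_dirty_log(0, 0x1000_0000, 0x1_0000).unwrap();
    /// let dirty_pages: u32 = bitmap.iter().map(|word| word.count_ones()).sum();
    /// vm.stop_dirty_log().unwrap();
    /// ```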
823     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
824         self.fd
825             .get_dirty_log(slot, memory_size as usize)
826             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
827     }
828 
829     ///
830     /// Initialize TDX for this VM
831     ///
832     #[cfg(feature = "tdx")]
833     fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
834         const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;
835 
836         let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
837             cpuid.iter().map(|e| (*e).into()).collect();
838         cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());
839 
840         #[repr(C)]
841         struct TdxInitVm {
842             attributes: u64,
843             max_vcpus: u32,
844             padding: u32,
845             mrconfigid: [u64; 6],
846             mrowner: [u64; 6],
847             mrownerconfig: [u64; 6],
848             cpuid_nent: u32,
849             cpuid_padding: u32,
850             cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
851         }
852         let data = TdxInitVm {
853             attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
854             max_vcpus,
855             padding: 0,
856             mrconfigid: [0; 6],
857             mrowner: [0; 6],
858             mrownerconfig: [0; 6],
859             cpuid_nent: cpuid.len() as u32,
860             cpuid_padding: 0,
861             cpuid_entries: cpuid.as_slice().try_into().unwrap(),
862         };
863 
864         tdx_command(
865             &self.fd.as_raw_fd(),
866             TdxCommand::InitVm,
867             0,
868             &data as *const _ as u64,
869         )
870         .map_err(vm::HypervisorVmError::InitializeTdx)
871     }
872 
873     ///
874     /// Finalize the TDX setup for this VM
875     ///
876     #[cfg(feature = "tdx")]
877     fn tdx_finalize(&self) -> vm::Result<()> {
878         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
879             .map_err(vm::HypervisorVmError::FinalizeTdx)
880     }
881 
882     ///
883     /// Initialize memory regions for the TDX VM
884     ///
885     #[cfg(feature = "tdx")]
886     fn tdx_init_memory_region(
887         &self,
888         host_address: u64,
889         guest_address: u64,
890         size: u64,
891         measure: bool,
892     ) -> vm::Result<()> {
893         #[repr(C)]
894         struct TdxInitMemRegion {
895             host_address: u64,
896             guest_address: u64,
897             pages: u64,
898         }
899         let data = TdxInitMemRegion {
900             host_address,
901             guest_address,
902             pages: size / 4096,
903         };
904 
905         tdx_command(
906             &self.fd.as_raw_fd(),
907             TdxCommand::InitMemRegion,
908             u32::from(measure),
909             &data as *const _ as u64,
910         )
911         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
912     }
913 
914     /// Downcast to the underlying KvmVm type
915     fn as_any(&self) -> &dyn Any {
916         self
917     }
918 }
919 
920 #[cfg(feature = "tdx")]
921 fn tdx_command(
922     fd: &RawFd,
923     command: TdxCommand,
924     flags: u32,
925     data: u64,
926 ) -> std::result::Result<(), std::io::Error> {
927     #[repr(C)]
928     struct TdxIoctlCmd {
929         command: TdxCommand,
930         flags: u32,
931         data: u64,
932         error: u64,
933         unused: u64,
934     }
935     let cmd = TdxIoctlCmd {
936         command,
937         flags,
938         data,
939         error: 0,
940         unused: 0,
941     };
942     // SAFETY: FFI call. All input parameters are valid.
943     let ret = unsafe {
944         ioctl_with_val(
945             fd,
946             KVM_MEMORY_ENCRYPT_OP(),
947             &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
948         )
949     };
950 
951     if ret < 0 {
952         return Err(std::io::Error::last_os_error());
953     }
954     Ok(())
955 }
956 
957 /// Wrapper over KVM system ioctls.
958 pub struct KvmHypervisor {
959     kvm: Kvm,
960 }
961 
962 impl KvmHypervisor {
963     #[cfg(target_arch = "x86_64")]
964     ///
965     /// Retrieve the list of MSRs supported by the hypervisor.
966     ///
967     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
968         self.kvm
969             .get_msr_index_list()
970             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
971     }
972 }
973 
974 /// Enum for KVM related error
975 #[derive(Debug, Error)]
976 pub enum KvmError {
977     #[error("Capability missing: {0:?}")]
978     CapabilityMissing(Cap),
979 }
980 
981 pub type KvmResult<T> = result::Result<T, KvmError>;
982 
983 impl KvmHypervisor {
984     /// Create a hypervisor based on Kvm
985     #[allow(clippy::new_ret_no_self)]
986     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
987         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
988         let api_version = kvm_obj.get_api_version();
989 
990         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
991             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
992         }
993 
994         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
995     }
996 
997     /// Check if the hypervisor is available
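    ///
    /// # Example
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// if KvmHypervisor::is_available().unwrap() {
    ///     println!("/dev/kvm is present");
    /// }
    /// ```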
998     pub fn is_available() -> hypervisor::Result<bool> {
999         match std::fs::metadata("/dev/kvm") {
1000             Ok(_) => Ok(true),
1001             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1002             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
1003                 err.into(),
1004             )),
1005         }
1006     }
1007 }
1008 
1009 /// Implementation of Hypervisor trait for KVM
1010 ///
1011 /// # Examples
1012 ///
1013 /// ```
1014 /// # use hypervisor::kvm::KvmHypervisor;
1015 /// # use std::sync::Arc;
1016 /// let kvm = KvmHypervisor::new().unwrap();
1017 /// let hypervisor = Arc::new(kvm);
1018 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1019 /// ```
1020 impl hypervisor::Hypervisor for KvmHypervisor {
1021     ///
1022     /// Returns the type of the hypervisor
1023     ///
1024     fn hypervisor_type(&self) -> HypervisorType {
1025         HypervisorType::Kvm
1026     }
1027 
1028     /// Create a KVM VM object of a specific VM type and return it as a Vm trait object
1029     ///
1030     /// # Examples
1031     ///
1032     /// ```
1033     /// # use hypervisor::kvm::KvmHypervisor;
1034     /// use hypervisor::kvm::KvmVm;
1035     /// let hypervisor = KvmHypervisor::new().unwrap();
1036     /// let vm = hypervisor.create_vm_with_type(0).unwrap();
1037     /// ```
1038     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1039         let fd: VmFd;
1040         loop {
1041             match self.kvm.create_vm_with_type(vm_type) {
1042                 Ok(res) => fd = res,
1043                 Err(e) => {
1044                     if e.errno() == libc::EINTR {
1045                         // If the error returned is EINTR, the ioctl was
1046                         // interrupted and we have to retry, as this cannot
1047                         // be considered a regular error.
1048                         continue;
1049                     } else {
1050                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
1051                     }
1052                 }
1053             }
1054             break;
1055         }
1056 
1057         let vm_fd = Arc::new(fd);
1058 
1059         #[cfg(target_arch = "x86_64")]
1060         {
1061             let msr_list = self.get_msr_list()?;
1062             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
1063             let mut msrs: Vec<MsrEntry> = vec![
1064                 MsrEntry {
1065                     ..Default::default()
1066                 };
1067                 num_msrs
1068             ];
1069             let indices = msr_list.as_slice();
1070             for (pos, index) in indices.iter().enumerate() {
1071                 msrs[pos].index = *index;
1072             }
1073 
1074             Ok(Arc::new(KvmVm {
1075                 fd: vm_fd,
1076                 msrs,
1077                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1078             }))
1079         }
1080 
1081         #[cfg(target_arch = "aarch64")]
1082         {
1083             Ok(Arc::new(KvmVm {
1084                 fd: vm_fd,
1085                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1086             }))
1087         }
1088     }
1089 
1090     /// Create a KVM VM object and return it as a Vm trait object
1091     ///
1092     /// # Examples
1093     ///
1094     /// ```
1095     /// # use hypervisor::kvm::KvmHypervisor;
1096     /// use hypervisor::kvm::KvmVm;
1097     /// let hypervisor = KvmHypervisor::new().unwrap();
1098     /// let vm = hypervisor.create_vm().unwrap();
1099     /// ```
1100     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1101         #[allow(unused_mut)]
1102         let mut vm_type: u64 = 0; // Create with default platform type
1103 
1104         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
1105         // size from the host and use that when creating the VM, which may
1106         // avoid unnecessary VM creation failures.
1107         #[cfg(target_arch = "aarch64")]
1108         if self.kvm.check_extension(Cap::ArmVmIPASize) {
1109             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
1110         }
1111 
1112         self.create_vm_with_type(vm_type)
1113     }
1114 
1115     fn check_required_extensions(&self) -> hypervisor::Result<()> {
1116         check_required_kvm_extensions(&self.kvm)
1117             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
1118     }
1119 
1120     #[cfg(target_arch = "x86_64")]
1121     ///
1122     /// X86 specific call to get the system supported CPUID values.
1123     ///
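    /// # Example
    ///
    /// A sketch, assuming an x86_64 host with KVM:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let cpuid = hypervisor.get_supported_cpuid().unwrap();
    /// assert!(!cpuid.is_empty());
    /// ```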
1124     fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
1125         let kvm_cpuid = self
1126             .kvm
1127             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
1128             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;
1129 
1130         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1131 
1132         Ok(v)
1133     }
1134 
1135     #[cfg(target_arch = "aarch64")]
1136     ///
1137     /// Retrieve the AArch64 host maximum IPA size supported by KVM.
1138     ///
1139     fn get_host_ipa_limit(&self) -> i32 {
1140         self.kvm.get_host_ipa_limit()
1141     }
1142 
1143     ///
1144     /// Retrieve TDX capabilities
1145     ///
1146     #[cfg(feature = "tdx")]
1147     fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
1148         let data = TdxCapabilities {
1149             nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
1150             ..Default::default()
1151         };
1152 
1153         tdx_command(
1154             &self.kvm.as_raw_fd(),
1155             TdxCommand::Capabilities,
1156             0,
1157             &data as *const _ as u64,
1158         )
1159         .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;
1160 
1161         Ok(data)
1162     }
1163 
1164     ///
1165     /// Get the number of supported hardware breakpoints
1166     ///
1167     fn get_guest_debug_hw_bps(&self) -> usize {
1168         #[cfg(target_arch = "x86_64")]
1169         {
1170             4
1171         }
1172         #[cfg(target_arch = "aarch64")]
1173         {
1174             self.kvm.get_guest_debug_hw_bps() as usize
1175         }
1176     }
1177 
1178     /// Get maximum number of vCPUs
1179     fn get_max_vcpus(&self) -> u32 {
1180         self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
1181     }
1182 }
1183 
1184 /// Vcpu struct for KVM
1185 pub struct KvmVcpu {
1186     fd: Arc<Mutex<VcpuFd>>,
1187     #[cfg(target_arch = "x86_64")]
1188     msrs: Vec<MsrEntry>,
1189     vm_ops: Option<Arc<dyn vm::VmOps>>,
1190     #[cfg(target_arch = "x86_64")]
1191     hyperv_synic: AtomicBool,
1192 }
1193 
1194 /// Implementation of Vcpu trait for KVM
1195 ///
1196 /// # Examples
1197 ///
1198 /// ```
1199 /// # use hypervisor::kvm::KvmHypervisor;
1200 /// # use std::sync::Arc;
1201 /// let kvm = KvmHypervisor::new().unwrap();
1202 /// let hypervisor = Arc::new(kvm);
1203 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1204 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1205 /// ```
1206 impl cpu::Vcpu for KvmVcpu {
1207     ///
1208     /// Returns `StandardRegisters` with default values set
1209     ///
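    /// # Example
    ///
    /// A sketch, assuming a KVM-capable host:
    ///
    /// ```no_run
    /// # use hypervisor::cpu::Vcpu;
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::vm::Vm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// // An all-default register set that can be filled in and passed to set_regs().
    /// let regs = vcpu.create_standard_regs();
    /// ```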
1210     fn create_standard_regs(&self) -> StandardRegisters {
1211         kvm_bindings::kvm_regs::default().into()
1212     }
1213     #[cfg(target_arch = "x86_64")]
1214     ///
1215     /// Returns the vCPU general purpose registers.
1216     ///
1217     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1218         Ok(self
1219             .fd
1220             .lock()
1221             .unwrap()
1222             .get_regs()
1223             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1224             .into())
1225     }
1226 
1227     ///
1228     /// Returns the vCPU general purpose registers.
1229     /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
1230     /// is used to read the registers one by one.
1231     ///
1232     #[cfg(target_arch = "aarch64")]
1233     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1234         let mut state = kvm_regs::default();
1235         let mut off = offset_of!(user_pt_regs, regs);
1236         // There are 31 user_pt_regs:
1237         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1238         // These are the general-purpose registers of the Armv8-a architecture
1239         // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
1240         for i in 0..31 {
1241             let mut bytes = [0_u8; 8];
1242             self.fd
1243                 .lock()
1244                 .unwrap()
1245                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1246                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1247             state.regs.regs[i] = u64::from_le_bytes(bytes);
1248             off += std::mem::size_of::<u64>();
1249         }
1250 
1251         // We are now entering the "Other register" section of the ARMv8-a architecture.
1252         // First one, stack pointer.
1253         let off = offset_of!(user_pt_regs, sp);
1254         let mut bytes = [0_u8; 8];
1255         self.fd
1256             .lock()
1257             .unwrap()
1258             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1259             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1260         state.regs.sp = u64::from_le_bytes(bytes);
1261 
1262         // Second one, the program counter.
1263         let off = offset_of!(user_pt_regs, pc);
1264         let mut bytes = [0_u8; 8];
1265         self.fd
1266             .lock()
1267             .unwrap()
1268             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1269             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1270         state.regs.pc = u64::from_le_bytes(bytes);
1271 
1272         // Next is the processor state.
1273         let off = offset_of!(user_pt_regs, pstate);
1274         let mut bytes = [0_u8; 8];
1275         self.fd
1276             .lock()
1277             .unwrap()
1278             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1279             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1280         state.regs.pstate = u64::from_le_bytes(bytes);
1281 
1282         // The stack pointer associated with EL1
1283         let off = offset_of!(kvm_regs, sp_el1);
1284         let mut bytes = [0_u8; 8];
1285         self.fd
1286             .lock()
1287             .unwrap()
1288             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1289             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1290         state.sp_el1 = u64::from_le_bytes(bytes);
1291 
1292         // Exception Link Register for EL1, when taking an exception to EL1, this register
1293         // holds the address to which to return afterwards.
1294         let off = offset_of!(kvm_regs, elr_el1);
1295         let mut bytes = [0_u8; 8];
1296         self.fd
1297             .lock()
1298             .unwrap()
1299             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1300             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1301         state.elr_el1 = u64::from_le_bytes(bytes);
1302 
1303         // Saved Program Status Registers; the kernel uses 5 of them.
1304         let mut off = offset_of!(kvm_regs, spsr);
1305         for i in 0..KVM_NR_SPSR as usize {
1306             let mut bytes = [0_u8; 8];
1307             self.fd
1308                 .lock()
1309                 .unwrap()
1310                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1311                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1312             state.spsr[i] = u64::from_le_bytes(bytes);
1313             off += std::mem::size_of::<u64>();
1314         }
1315 
1316         // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
1317         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1318         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1319         for i in 0..32 {
1320             let mut bytes = [0_u8; 16];
1321             self.fd
1322                 .lock()
1323                 .unwrap()
1324                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
1325                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1326             state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
1327             off += mem::size_of::<u128>();
1328         }
1329 
1330         // Floating-point Status Register
1331         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1332         let mut bytes = [0_u8; 4];
1333         self.fd
1334             .lock()
1335             .unwrap()
1336             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1337             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1338         state.fp_regs.fpsr = u32::from_le_bytes(bytes);
1339 
1340         // Floating-point Control Register
1341         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1342         let mut bytes = [0_u8; 4];
1343         self.fd
1344             .lock()
1345             .unwrap()
1346             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1347             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1348         state.fp_regs.fpcr = u32::from_le_bytes(bytes);
1349         Ok(state.into())
1350     }
1351 
1352     #[cfg(target_arch = "x86_64")]
1353     ///
1354     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1355     ///
1356     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1357         let regs = (*regs).into();
1358         self.fd
1359             .lock()
1360             .unwrap()
1361             .set_regs(&regs)
1362             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1363     }
1364 
1365     ///
1366     /// Sets the vCPU general purpose registers.
1367     /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
1368     /// is used to set the registers one by one.
1369     ///
1370     #[cfg(target_arch = "aarch64")]
1371     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1372         // The function follows the exact same order as `state`. Look there
1373         // for additional info on the registers.
1374         let kvm_regs_state: kvm_regs = (*state).into();
1375         let mut off = offset_of!(user_pt_regs, regs);
1376         for i in 0..31 {
1377             self.fd
1378                 .lock()
1379                 .unwrap()
1380                 .set_one_reg(
1381                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1382                     &kvm_regs_state.regs.regs[i].to_le_bytes(),
1383                 )
1384                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1385             off += std::mem::size_of::<u64>();
1386         }
1387 
1388         let off = offset_of!(user_pt_regs, sp);
1389         self.fd
1390             .lock()
1391             .unwrap()
1392             .set_one_reg(
1393                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1394                 &kvm_regs_state.regs.sp.to_le_bytes(),
1395             )
1396             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1397 
1398         let off = offset_of!(user_pt_regs, pc);
1399         self.fd
1400             .lock()
1401             .unwrap()
1402             .set_one_reg(
1403                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1404                 &kvm_regs_state.regs.pc.to_le_bytes(),
1405             )
1406             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1407 
1408         let off = offset_of!(user_pt_regs, pstate);
1409         self.fd
1410             .lock()
1411             .unwrap()
1412             .set_one_reg(
1413                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1414                 &kvm_regs_state.regs.pstate.to_le_bytes(),
1415             )
1416             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1417 
1418         let off = offset_of!(kvm_regs, sp_el1);
1419         self.fd
1420             .lock()
1421             .unwrap()
1422             .set_one_reg(
1423                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1424                 &kvm_regs_state.sp_el1.to_le_bytes(),
1425             )
1426             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1427 
1428         let off = offset_of!(kvm_regs, elr_el1);
1429         self.fd
1430             .lock()
1431             .unwrap()
1432             .set_one_reg(
1433                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1434                 &kvm_regs_state.elr_el1.to_le_bytes(),
1435             )
1436             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1437 
1438         let mut off = offset_of!(kvm_regs, spsr);
1439         for i in 0..KVM_NR_SPSR as usize {
1440             self.fd
1441                 .lock()
1442                 .unwrap()
1443                 .set_one_reg(
1444                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1445                     &kvm_regs_state.spsr[i].to_le_bytes(),
1446                 )
1447                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1448             off += std::mem::size_of::<u64>();
1449         }
1450 
1451         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1452         for i in 0..32 {
1453             self.fd
1454                 .lock()
1455                 .unwrap()
1456                 .set_one_reg(
1457                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1458                     &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
1459                 )
1460                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1461             off += mem::size_of::<u128>();
1462         }
1463 
1464         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1465         self.fd
1466             .lock()
1467             .unwrap()
1468             .set_one_reg(
1469                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1470                 &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
1471             )
1472             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1473 
1474         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1475         self.fd
1476             .lock()
1477             .unwrap()
1478             .set_one_reg(
1479                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1480                 &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
1481             )
1482             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1483         Ok(())
1484     }
1485 
1486     #[cfg(target_arch = "x86_64")]
1487     ///
1488     /// Returns the vCPU special registers.
1489     ///
1490     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1491         Ok(self
1492             .fd
1493             .lock()
1494             .unwrap()
1495             .get_sregs()
1496             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1497             .into())
1498     }
1499 
1500     #[cfg(target_arch = "x86_64")]
1501     ///
1502     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1503     ///
1504     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1505         let sregs = (*sregs).into();
1506         self.fd
1507             .lock()
1508             .unwrap()
1509             .set_sregs(&sregs)
1510             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1511     }
1512 
1513     #[cfg(target_arch = "x86_64")]
1514     ///
1515     /// Returns the floating point state (FPU) from the vCPU.
1516     ///
1517     fn get_fpu(&self) -> cpu::Result<FpuState> {
1518         Ok(self
1519             .fd
1520             .lock()
1521             .unwrap()
1522             .get_fpu()
1523             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1524             .into())
1525     }
1526 
1527     #[cfg(target_arch = "x86_64")]
1528     ///
1529     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1530     ///
1531     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1532         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1533         self.fd
1534             .lock()
1535             .unwrap()
1536             .set_fpu(&fpu)
1537             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1538     }
1539 
1540     #[cfg(target_arch = "x86_64")]
1541     ///
1542     /// X86 specific call to set up the CPUID registers.
1543     ///
1544     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1545         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1546             cpuid.iter().map(|e| (*e).into()).collect();
1547         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1548             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1549 
1550         self.fd
1551             .lock()
1552             .unwrap()
1553             .set_cpuid2(&kvm_cpuid)
1554             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1555     }
1556 
1557     #[cfg(target_arch = "x86_64")]
1558     ///
1559     /// X86 specific call to enable HyperV SynIC
1560     ///
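    /// # Example (illustrative)
    ///
    /// A minimal sketch, assuming a KVM host whose kernel exposes
    /// `KVM_CAP_HYPERV_SYNIC`; the split IRQ chip is enabled first so the
    /// LAPIC stays in the kernel:
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.enable_hyperv_synic().unwrap();
    /// ```
    ///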
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about the Hyper-V SynIC being enabled and
        // emulated, as it will later influence which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .lock()
            .unwrap()
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }

    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .lock()
            .unwrap()
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .lock()
            .unwrap()
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
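    /// # Example (illustrative)
    ///
    /// A minimal sketch: `KVM_GET_MSRS` only reads the MSRs whose `index`
    /// fields were prepopulated, so the list is seeded here from
    /// `boot_msr_entries()`:
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mut msrs = vcpu.boot_msr_entries();
    /// let read = vcpu.get_msrs(&mut msrs).unwrap();
    /// assert!(read <= msrs.len());
    /// ```
    ///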
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .lock()
            .unwrap()
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Set up the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .lock()
            .unwrap()
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }

    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }

    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
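    /// # Example (illustrative)
    ///
    /// A minimal sketch; on the freshly created vCPU below, paging is still
    /// disabled, so the translation is expected to be the identity:
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let (gpa, _) = vcpu.translate_gva(0x1000, 0).unwrap();
    /// assert_eq!(gpa, 0x1000);
    /// ```
    ///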
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .lock()
            .unwrap()
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }

    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
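    /// # Example (illustrative)
    ///
    /// A minimal sketch; a real VMM would first set up guest memory and
    /// registers, then dispatch on the returned exit reason in a loop:
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// match vcpu.run() {
    ///     Ok(_exit) => { /* dispatch on the exit reason */ }
    ///     Err(_e) => { /* a vCPU without guest memory typically ends up here */ }
    /// }
    /// ```
    ///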
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.lock().unwrap().run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::Ignore)
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::Ignore)
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::Ignore)
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::Ignore)
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or the guest doesn't
            // use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }

    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
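    /// # Example (illustrative)
    ///
    /// A minimal sketch placing one hardware breakpoint and enabling
    /// single-stepping; the breakpoint address is an arbitrary placeholder:
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// # use vm_memory::GuestAddress;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_guest_debug(&[GuestAddress(0x1000)], true).unwrap();
    /// ```
    ///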
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        let mut dbg = kvm_guest_debug {
            #[cfg(target_arch = "x86_64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            #[cfg(target_arch = "aarch64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set the debug registers.
        // Here we assume that the number of addresses does not exceed what
        // `Hypervisor::get_guest_debug_hw_bps()` specifies.
        #[cfg(target_arch = "x86_64")]
        {
            // Set bits 9 and 10.
            // bit 9: GE (global exact breakpoint enable) flag.
            // bit 10: always 1.
            dbg.arch.debugreg[7] = 0x0600;

            for (i, addr) in addrs.iter().enumerate() {
                dbg.arch.debugreg[i] = addr.0;
                // Set global breakpoint enable flag
                dbg.arch.debugreg[7] |= 2 << (i * 2);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            for (i, addr) in addrs.iter().enumerate() {
                // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
                // bit 0: 1 (Enabled)
                // bit 1~2: 0b11 (PMC = EL1/EL0)
                // bit 5~8: 0b1111 (BAS = AArch64)
                // others: 0
                dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
                // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
                // bit 2~52: VA[2:52]
                dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
            }
        }
        self.fd
            .lock()
            .unwrap()
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }

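    ///
    /// Initialize an AArch64 vCPU with the target and features described in
    /// `kvi` (the `KVM_ARM_VCPU_INIT` ioctl).
    ///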
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }

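    ///
    /// Finalize the configuration of a vCPU feature (e.g. SVE) that was
    /// requested at init time (the `KVM_ARM_VCPU_FINALIZE` ioctl).
    ///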
    #[cfg(target_arch = "aarch64")]
    fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .vcpu_finalize(&feature)
            .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
    }

    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }

    ///
    /// Gets the value of a system register
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
        //
        // The Arm Architecture Reference Manual defines the encoding of
        // AArch64 system registers, see
        // https://developer.arm.com/documentation/ddi0487 (chapter D12).
        // KVM defines its own ID for each AArch64 system register, which is
        // used when calling `KVM_GET/SET_ONE_REG` to access a system
        // register of a guest.
        // A mapping exists between the Arm standard encoding and the KVM ID.
        // This function takes the standard u32 encoding as input, converts
        // it to the corresponding KVM ID, and calls `KVM_GET_ONE_REG` to
        // read the value of the system register.
        //
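        // In the Arm encoding, op0 sits at bits [20:19], op1 at [18:16],
        // CRn at [15:12], CRm at [11:8] and op2 at [7:5]; shifting right by
        // 5 below lines these fields up with the KVM sysreg ID layout
        // (op2 at bit 0, CRm at bit 3, CRn at bit 7, op1 at bit 11,
        // op0 at bit 14).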
        let id: u64 = KVM_REG_ARM64
            | KVM_REG_SIZE_U64
            | KVM_REG_ARM64_SYSREG as u64
            | ((((sys_reg) >> 5)
                & (KVM_REG_ARM64_SYSREG_OP0_MASK
                    | KVM_REG_ARM64_SYSREG_OP1_MASK
                    | KVM_REG_ARM64_SYSREG_CRN_MASK
                    | KVM_REG_ARM64_SYSREG_CRM_MASK
                    | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(id, &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
        Ok(u64::from_le_bytes(bytes))
    }

    ///
    /// Configure core registers for a given CPU.
    ///
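    /// # Example (illustrative)
    ///
    /// A minimal sketch for the boot vCPU; the kernel entry point and FDT
    /// addresses below are placeholders chosen for illustration only:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.setup_regs(0, 0x4008_0000, 0x7f00_0000).unwrap();
    /// ```
    ///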
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset_of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
                &PSTATE_FAULT_BITS_64.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Set the PC (Program Counter) to the current program address (kernel address).
            let pc = offset_of!(user_pt_regs, pc) + kreg_off;
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
                    &boot_ip.to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
                    &fdt_start.to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else; otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save: the state it returns
    /// could well be affected by the internal state modifications of the
    /// other GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a prepopulated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method of getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
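        // For example (hypothetical indices), with entries [A, B, C, D]
        // where B is unsupported: GET_MSRS returns 1 (only A), we skip B,
        // retry with [C, D], and stop once a whole chunk is read back.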
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;
        let tsc_khz = self.tsc_khz()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
            tsc_khz,
        }
        .into())
    }

    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            ..Default::default()
        };
        // Get the core registers.
        state.core_regs = self.get_regs()?.into();

        // Get the system registers.
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 registers.
        let mut sys_regs: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .lock()
            .unwrap()
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system
        // registers.
        // The register list contains the number of registers and their ids. We
        // will need to call KVM_GET_ONE_REG on each id in order to save
        // all of them. We carve out from the list the core registers, which are
        // represented in the kernel by the kvm_regs structure and for which we
        // can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            let mut bytes = [0_u8; 8];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(*index, &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
            sys_regs.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: u64::from_le_bytes(bytes),
            });
        }

        state.sys_regs = sys_regs;

        Ok(state.into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        if let Some(freq) = state.tsc_khz {
            self.set_tsc_khz(freq)?;
        }

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back to a slower method of setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }

    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        // Set core registers
        self.set_regs(&state.core_regs.into())?;
        // Set system registers
        for reg in &state.sys_regs {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(reg.id, &reg.addr.to_le_bytes())
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }

        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(
            &self.fd.lock().unwrap().as_raw_fd(),
            TdxCommand::InitVcpu,
            0,
            hob_address,
        )
        .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let mut fd = self.fd.as_ref().lock().unwrap();
        let kvm_run = fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure
        let tdx_vmcall = unsafe {
            &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
                as *mut KvmTdxExit))
                .u
                .vmcall
        };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for the TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let mut fd = self.fd.as_ref().lock().unwrap();
        let kvm_run = fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure
        let tdx_vmcall = unsafe {
            &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
                as *mut KvmTdxExit))
                .u
                .vmcall
        };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }

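    ///
    /// Check whether the in-kernel PMUv3 device attribute is supported for
    /// this vCPU, i.e. whether a guest PMU can be initialized.
    ///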
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
    }

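    ///
    /// Initialize the guest PMUv3: the PMU overflow interrupt attribute is
    /// set before the init attribute, as KVM requires the interrupt to be
    /// configured first.
    ///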
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        self.fd
            .lock()
            .unwrap()
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .lock()
            .unwrap()
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the frequency of the TSC if available
    ///
    fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
        match self.fd.lock().unwrap().get_tsc_khz() {
            Err(e) => {
                if e.errno() == libc::EIO {
                    Ok(None)
                } else {
                    Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
                }
            }
            Ok(v) => Ok(Some(v)),
        }
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the frequency of the TSC if available
    ///
    fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
        match self.fd.lock().unwrap().set_tsc_khz(freq) {
            Err(e) => {
                if e.errno() == libc::EIO {
                    Ok(())
                } else {
                    Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
                }
            }
            Ok(_) => Ok(()),
        }
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Trigger NMI interrupt
    ///
    fn nmi(&self) -> cpu::Result<()> {
        match self.fd.lock().unwrap().nmi() {
            Err(e) => {
                if e.errno() == libc::EIO {
                    Ok(())
                } else {
                    Err(cpu::HypervisorCpuError::Nmi(e.into()))
                }
            }
            Ok(_) => Ok(()),
        }
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<XsaveState> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
            .into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
        let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
        self.fd
            .lock()
            .unwrap()
            .set_xsave(&xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .lock()
            .unwrap()
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .lock()
            .unwrap()
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .lock()
            .unwrap()
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}