// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset_of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::StandardRegisters;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
#[cfg(feature = "tdx")]
use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

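// Raw capability number for KVM_CAP_SGX_ATTRIBUTE, presumably defined by hand
// here because it is not exported by the kvm-bindings crate.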
#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(target_arch = "x86_64")]
use vmm_sys_util::ioctl_io_nr;

#[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))]
use vmm_sys_util::ioctl_ioc_nr;

#[cfg(target_arch = "x86_64")]
ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

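// Command identifiers for KVM_MEMORY_ENCRYPT_OP; the values are assumed to
// mirror the kernel's kvm_tdx_cmd_id enumeration from the TDX patch series.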
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

#[cfg(feature = "tdx")]
#[derive(Copy, Clone)]
pub struct KvmTdxExit {
    pub type_: u32,
    pub pad: u32,
    pub u: KvmTdxExitU,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub union KvmTdxExitU {
    pub vmcall: KvmTdxExitVmcall,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct KvmTdxExitVmcall {
    pub type_: u64,
    pub subfunction: u64,
    pub reg_mask: u64,
    pub in_r12: u64,
    pub in_r13: u64,
    pub in_r14: u64,
    pub in_r15: u64,
    pub in_rbx: u64,
    pub in_rdi: u64,
    pub in_rsi: u64,
    pub in_r8: u64,
    pub in_r9: u64,
    pub in_rdx: u64,
    pub status_code: u64,
    pub out_r11: u64,
    pub out_r12: u64,
    pub out_r13: u64,
    pub out_r14: u64,
    pub out_r15: u64,
    pub out_rbx: u64,
    pub out_rdi: u64,
    pub out_rsi: u64,
    pub out_r8: u64,
    pub out_r9: u64,
    pub out_rdx: u64,
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
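
// A minimal sketch (with hypothetical slot and address values) of how the
// flag conversion above behaves: a writable region never carries
// KVM_MEM_READONLY, and dirty logging maps to KVM_MEM_LOG_DIRTY_PAGES.
#[cfg(test)]
mod flag_conversion_tests {
    use super::*;

    #[test]
    fn writable_region_with_dirty_log_maps_to_kvm_flags() {
        let region = UserMemoryRegion {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x1000,
            userspace_addr: 0x7f00_0000_0000,
            flags: USER_MEMORY_REGION_READ
                | USER_MEMORY_REGION_WRITE
                | USER_MEMORY_REGION_LOG_DIRTY,
        };
        let kvm_region: kvm_userspace_memory_region = region.into();
        // Writable means KVM_MEM_READONLY stays clear; dirty logging sets
        // KVM_MEM_LOG_DIRTY_PAGES.
        assert_eq!(kvm_region.flags, KVM_MEM_LOG_DIRTY_PAGES);
    }
}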

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
    fn from(s: kvm_bindings::kvm_regs) -> Self {
        crate::StandardRegisters::Kvm(s)
    }
}

impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
    fn from(e: crate::StandardRegisters) -> Self {
        match e {
            crate::StandardRegisters::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

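/// Bookkeeping for a memory slot whose dirty-page logging is deferred until
/// `start_dirty_log` is called (see `create_user_memory_region`).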
struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    pub fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

/// Implementation of Vm trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }

    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }

    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }

    ///
    /// Unregisters an event previously registered to trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }

    ///
    /// Creates a vCPU for this VM and returns it as a `Vcpu` trait object.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let fd = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: Arc::new(Mutex::new(fd)),
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }

    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }

    ///
    /// Unregisters an event from the address it was previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This way we resolve the range-checking problem and give
                    // a distinct `devid` to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
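                    // For example (hypothetical BDF), devid 0x0001_0010, i.e.
                    // segment 1, bus 0, device 2, function 0, becomes 0x0110.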
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
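        // kvm_irq_routing ends in a flexible array member, so allocate a
        // buffer of kvm_irq_routing headers large enough to also hold
        // entries.len() trailing kvm_irq_routing_entry elements.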
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing was allocated large enough for entries.len()
        // entries, and entries_slice is created with that same length, so it
        // is guaranteed to hold everything copied from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }

    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
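    /// A minimal sketch with placeholder slot and address values:
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// // Read-only region, no dirty-page logging.
    /// let _region = vm.make_user_memory_region(0, 0x10_0000, 0x1000, 0x7f00_0000, true, false);
    /// ```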
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }

    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty-page logging
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create the guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need it, dirty-page logging is turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }

    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }

    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in the
        // kernel; the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }

    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }

    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }

    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
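    /// For example, a 2 MiB slot spans 512 4-KiB pages, so the returned
    /// bitmap packs 512 bits into eight u64 words.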
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
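        // Note: the TDX module measures guest memory in 4 KiB pages; `size`
        // is assumed to be 4 KiB-aligned here.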
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }

    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}

pub type KvmResult<T> = result::Result<T, KvmError>;

impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }

    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}

/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    /// Create a KVM vm object of a specific VM type and return the object as a Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, meaning the ioctl
                        // was interrupted, we have to retry, as this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

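        // The kernel fills `data` in place through the pointer passed to
        // KVM_MEMORY_ENCRYPT_OP below.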
        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}

/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: Arc<Mutex<VcpuFd>>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}

/// Implementation of Vcpu trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// ```
impl cpu::Vcpu for KvmVcpu {
    ///
    /// Returns StandardRegisters with default values set
    ///
    #[cfg(target_arch = "x86_64")]
    fn create_standard_regs(&self) -> StandardRegisters {
        kvm_bindings::kvm_regs::default().into()
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }

    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64; `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state = kvm_regs::default();
        let mut off = offset_of!(user_pt_regs, regs);
        // There are 31 general-purpose registers in user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
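        // `arm64_core_reg_id!` builds a KVM_REG_ARM64 | KVM_REG_ARM_CORE
        // register id from the register size and the byte offset into
        // kvm_regs (the macro converts the offset into 32-bit word units).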
        for i in 0..31 {
            let mut bytes = [0_u8; 8];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.regs.regs[i] = u64::from_le_bytes(bytes);
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the Armv8-A architecture.
        // First, the stack pointer.
        let off = offset_of!(user_pt_regs, sp);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.sp = u64::from_le_bytes(bytes);

        // Second, the program counter.
        let off = offset_of!(user_pt_regs, pc);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.pc = u64::from_le_bytes(bytes);

        // Next is the processor state.
        let off = offset_of!(user_pt_regs, pstate);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.regs.pstate = u64::from_le_bytes(bytes);

        // The stack pointer associated with EL1
        let off = offset_of!(kvm_regs, sp_el1);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.sp_el1 = u64::from_le_bytes(bytes);

        // Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to which to return afterwards.
        let off = offset_of!(kvm_regs, elr_el1);
        let mut bytes = [0_u8; 8];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.elr_el1 = u64::from_le_bytes(bytes);

        // Saved Program Status Registers; the kernel uses 5 of them.
        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            let mut bytes = [0_u8; 8];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.spsr[i] = u64::from_le_bytes(bytes);
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            let mut bytes = [0_u8; 16];
            self.fd
                .lock()
                .unwrap()
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
                .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
            state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        let mut bytes = [0_u8; 4];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.fp_regs.fpsr = u32::from_le_bytes(bytes);

        // Floating-point Control Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        let mut bytes = [0_u8; 4];
        self.fd
            .lock()
            .unwrap()
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
            .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
        state.fp_regs.fpcr = u32::from_le_bytes(bytes);
        Ok(state.into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .lock()
            .unwrap()
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64; `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This follows the same register order as `get_regs`. Look there
        // for some additional info on the individual registers.
        let kvm_regs_state: kvm_regs = (*state).into();
        let mut off = offset_of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    &kvm_regs_state.regs.regs[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset_of!(user_pt_regs, sp);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.sp.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pc);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.pc.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pstate);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.regs.pstate.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, sp_el1);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.sp_el1.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, elr_el1);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                &kvm_regs_state.elr_el1.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    &kvm_regs_state.spsr[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .lock()
                .unwrap()
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        self.fd
            .lock()
            .unwrap()
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .lock()
            .unwrap()
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .lock()
            .unwrap()
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to set up the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .lock()
            .unwrap()
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
1563     /// X86 specific call to enable the Hyper-V SynIC
1564     ///
1565     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1566         // Record that the Hyper-V SynIC is enabled and emulated, as this
1567         // will later influence which MSRs need to be saved.
1568         self.hyperv_synic.store(true, Ordering::Release);
1569 
1570         let cap = kvm_enable_cap {
1571             cap: KVM_CAP_HYPERV_SYNIC,
1572             ..Default::default()
1573         };
1574         self.fd
1575             .lock()
1576             .unwrap()
1577             .enable_cap(&cap)
1578             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1579     }
1580 
1581     ///
1582     /// X86 specific call to retrieve the CPUID registers.
1583     ///
1584     #[cfg(target_arch = "x86_64")]
1585     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1586         let kvm_cpuid = self
1587             .fd
1588             .lock()
1589             .unwrap()
1590             .get_cpuid2(num_entries)
1591             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1592 
1593         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1594 
1595         Ok(v)
1596     }
1597 
1598     #[cfg(target_arch = "x86_64")]
1599     ///
1600     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1601     ///
1602     fn get_lapic(&self) -> cpu::Result<LapicState> {
1603         Ok(self
1604             .fd
1605             .lock()
1606             .unwrap()
1607             .get_lapic()
1608             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1609             .into())
1610     }
1611 
1612     #[cfg(target_arch = "x86_64")]
1613     ///
1614     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1615     ///
1616     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1617         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1618         self.fd
1619             .lock()
1620             .unwrap()
1621             .set_lapic(&klapic)
1622             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1623     }
1624 
1625     #[cfg(target_arch = "x86_64")]
1626     ///
1627     /// Returns the model-specific registers (MSR) for this vCPU.
1628     ///
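    /// # Example
    ///
    /// A hedged sketch that reads back the boot-time MSR list (see
    /// `boot_msr_entries()` below; some entries may be unsupported, hence
    /// the `<=`):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mut msrs = vcpu.boot_msr_entries();
    /// let num = vcpu.get_msrs(&mut msrs).unwrap();
    /// assert!(num <= msrs.len());
    /// ```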
1629     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1630         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1631         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1632         let succ = self
1633             .fd
1634             .lock()
1635             .unwrap()
1636             .get_msrs(&mut kvm_msrs)
1637             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1638 
1639         msrs[..succ].copy_from_slice(
1640             &kvm_msrs.as_slice()[..succ]
1641                 .iter()
1642                 .map(|e| (*e).into())
1643                 .collect::<Vec<MsrEntry>>(),
1644         );
1645 
1646         Ok(succ)
1647     }
1648 
1649     #[cfg(target_arch = "x86_64")]
1650     ///
1651     /// Set up the model-specific registers (MSRs) for this vCPU.
1652     /// Returns the number of MSR entries actually written.
1653     ///
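    /// # Example
    ///
    /// A hedged sketch that writes the boot-time MSR defaults (see
    /// `boot_msr_entries()` below):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let _written = vcpu.set_msrs(&vcpu.boot_msr_entries()).unwrap();
    /// ```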
1654     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1655         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1656         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1657         self.fd
1658             .lock()
1659             .unwrap()
1660             .set_msrs(&kvm_msrs)
1661             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1662     }
1663 
1664     ///
1665     /// Returns the vcpu's current "multiprocessing state".
1666     ///
1667     fn get_mp_state(&self) -> cpu::Result<MpState> {
1668         Ok(self
1669             .fd
1670             .lock()
1671             .unwrap()
1672             .get_mp_state()
1673             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1674             .into())
1675     }
1676 
1677     ///
1678     /// Sets the vcpu's current "multiprocessing state".
1679     ///
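    /// # Example
    ///
    /// A hedged round-trip sketch following the `state()`/`set_state()`
    /// examples below (assumes a KVM-capable host):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mp_state = vcpu.get_mp_state().unwrap();
    /// vcpu.set_mp_state(mp_state).unwrap();
    /// ```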
1680     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1681         self.fd
1682             .lock()
1683             .unwrap()
1684             .set_mp_state(mp_state.into())
1685             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1686     }
1687 
1688     #[cfg(target_arch = "x86_64")]
1689     ///
1690     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1691     ///
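    /// # Example
    ///
    /// A hedged, compile-only sketch (`no_run`): with paging disabled on a
    /// freshly created vCPU the translation is expected to be the identity
    /// mapping.
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let (gpa, _) = vcpu.translate_gva(0x10_0000, 0).unwrap();
    /// ```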
1692     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1693         let tr = self
1694             .fd
1695             .lock()
1696             .unwrap()
1697             .translate_gva(gva)
1698             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1699         // tr.valid is set if the GVA is mapped to a valid GPA.
1700         match tr.valid {
1701             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1702                 "Invalid GVA: {:#x}",
1703                 gva
1704             ))),
1705             _ => Ok((tr.physical_address, 0)),
1706         }
1707     }
1708 
1709     ///
1710     /// Triggers the running of the current virtual CPU, returning an exit reason.
1711     ///
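    /// # Example
    ///
    /// A minimal, compile-only dispatch-loop sketch (`no_run`); a real VMM
    /// would set up guest memory and vCPU state first and handle every exit
    /// reason (the `hypervisor::cpu::VmExit` path is assumed here):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// loop {
    ///     match vcpu.run().unwrap() {
    ///         hypervisor::cpu::VmExit::Reset => break,
    ///         _ => continue,
    ///     }
    /// }
    /// ```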
1712     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1713         match self.fd.lock().unwrap().run() {
1714             Ok(run) => match run {
1715                 #[cfg(target_arch = "x86_64")]
1716                 VcpuExit::IoIn(addr, data) => {
1717                     if let Some(vm_ops) = &self.vm_ops {
1718                         return vm_ops
1719                             .pio_read(addr.into(), data)
1720                             .map(|_| cpu::VmExit::Ignore)
1721                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1722                     }
1723 
1724                     Ok(cpu::VmExit::Ignore)
1725                 }
1726                 #[cfg(target_arch = "x86_64")]
1727                 VcpuExit::IoOut(addr, data) => {
1728                     if let Some(vm_ops) = &self.vm_ops {
1729                         return vm_ops
1730                             .pio_write(addr.into(), data)
1731                             .map(|_| cpu::VmExit::Ignore)
1732                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1733                     }
1734 
1735                     Ok(cpu::VmExit::Ignore)
1736                 }
1737                 #[cfg(target_arch = "x86_64")]
1738                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1739                 #[cfg(target_arch = "x86_64")]
1740                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1741 
1742                 #[cfg(target_arch = "aarch64")]
1743                 VcpuExit::SystemEvent(event_type, flags) => {
1744                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1745                     // On AArch64, when the VM is shut down, run() returns
1746                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1747                     if event_type == KVM_SYSTEM_EVENT_RESET {
1748                         Ok(cpu::VmExit::Reset)
1749                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1750                         Ok(cpu::VmExit::Shutdown)
1751                     } else {
1752                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1753                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
1754                             event_type,
1755                             flags
1756                         )))
1757                     }
1758                 }
1759 
1760                 VcpuExit::MmioRead(addr, data) => {
1761                     if let Some(vm_ops) = &self.vm_ops {
1762                         return vm_ops
1763                             .mmio_read(addr, data)
1764                             .map(|_| cpu::VmExit::Ignore)
1765                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1766                     }
1767 
1768                     Ok(cpu::VmExit::Ignore)
1769                 }
1770                 VcpuExit::MmioWrite(addr, data) => {
1771                     if let Some(vm_ops) = &self.vm_ops {
1772                         return vm_ops
1773                             .mmio_write(addr, data)
1774                             .map(|_| cpu::VmExit::Ignore)
1775                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1776                     }
1777 
1778                     Ok(cpu::VmExit::Ignore)
1779                 }
1780                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1781                 #[cfg(feature = "tdx")]
1782                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1783                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1784 
1785                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1786                     "Unexpected exit reason on vcpu run: {:?}",
1787                     r
1788                 ))),
1789             },
1790 
1791             Err(ref e) => match e.errno() {
1792                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1793                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1794                     "VCPU error {:?}",
1795                     e
1796                 ))),
1797             },
1798         }
1799     }
1800 
1801     #[cfg(target_arch = "x86_64")]
1802     ///
1803     /// Let the guest know that it has been paused, which prevents
1804     /// potential soft lockups when it is resumed.
1805     ///
1806     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1807         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
1808             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1809             // which could be because we're still in firmware or the guest doesn't
1810             // use KVM clock.
1811             if e.errno() != libc::EINVAL {
1812                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1813             }
1814         }
1815 
1816         Ok(())
1817     }
1818 
1819     ///
1820     /// Sets debug registers to set hardware breakpoints and/or enable single step.
1821     ///
1822     fn set_guest_debug(
1823         &self,
1824         addrs: &[vm_memory::GuestAddress],
1825         singlestep: bool,
1826     ) -> cpu::Result<()> {
1827         let mut dbg = kvm_guest_debug {
1828             #[cfg(target_arch = "x86_64")]
1829             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1830             #[cfg(target_arch = "aarch64")]
1831             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1832             ..Default::default()
1833         };
1834         if singlestep {
1835             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1836         }
1837 
1838         // Set the debug registers.
1839         // Here we assume that the number of addresses does not exceed what
1840         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1841         #[cfg(target_arch = "x86_64")]
1842         {
1843             // Set bits 9 and 10.
1844             // bit 9: GE (global exact breakpoint enable) flag.
1845             // bit 10: always 1.
1846             dbg.arch.debugreg[7] = 0x0600;
1847 
1848             for (i, addr) in addrs.iter().enumerate() {
1849                 dbg.arch.debugreg[i] = addr.0;
1850                 // Set global breakpoint enable flag
1851                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1852             }
1853         }
1854         #[cfg(target_arch = "aarch64")]
1855         {
1856             for (i, addr) in addrs.iter().enumerate() {
1857                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1858                 // bit 0: 1 (Enabled)
1859                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1860                 // bit 5~8: 0b1111 (BAS = AArch64)
1861                 // others: 0
1862                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1863                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1864                 // bit 2~52: VA[2:52]
1865                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1866             }
1867         }
1868         self.fd
1869             .lock()
1870             .unwrap()
1871             .set_guest_debug(&dbg)
1872             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1873     }
1874 
1875     #[cfg(target_arch = "aarch64")]
1876     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1877         self.fd
1878             .lock()
1879             .unwrap()
1880             .vcpu_init(kvi)
1881             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1882     }
1883 
1884     #[cfg(target_arch = "aarch64")]
1885     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
1886         self.fd
1887             .lock()
1888             .unwrap()
1889             .vcpu_finalize(&feature)
1890             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
1891     }
1892 
1893     ///
1894     /// Gets a list of the guest registers that are supported for the
1895     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1896     ///
1897     #[cfg(target_arch = "aarch64")]
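    /// A hedged usage sketch (`ignore`), mirroring the `state()` path below;
    /// roughly 500 registers are reported on Armv8:
    ///
    /// ```ignore
    /// let mut reg_list = RegList::new(500).unwrap();
    /// vcpu.get_reg_list(&mut reg_list).unwrap();
    /// ```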
1898     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1899         self.fd
1900             .lock()
1901             .unwrap()
1902             .get_reg_list(reg_list)
1903             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1904     }
1905 
1906     ///
1907     /// Gets the value of a system register
1908     ///
1909     #[cfg(target_arch = "aarch64")]
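    /// A hedged sketch (`ignore`); `MPIDR_EL1_ENC` stands for the Arm-standard
    /// u32 encoding of the register (a hypothetical constant, see chapter D12
    /// of the Arm ARM):
    ///
    /// ```ignore
    /// let mpidr = vcpu.get_sys_reg(MPIDR_EL1_ENC).unwrap();
    /// ```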
1910     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1911         //
1912         // The Arm Architecture Reference Manual defines the encoding of
1913         // AArch64 system registers, see
1914         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1915         // KVM defines its own ID for each AArch64 system register, which is
1916         // used when calling `KVM_GET/SET_ONE_REG` to access a guest system
1917         // register.
1918         // A mapping exists between the Arm standard encoding and the KVM ID.
1919         // This function takes the standard u32 ID as an input parameter,
1920         // converts it to the corresponding KVM ID, and calls the
1921         // `KVM_GET_ONE_REG` API to read the value of the system register.
1922         //
1923         let id: u64 = KVM_REG_ARM64
1924             | KVM_REG_SIZE_U64
1925             | KVM_REG_ARM64_SYSREG as u64
1926             | ((((sys_reg) >> 5)
1927                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1928                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1929                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1930                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1931                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1932         let mut bytes = [0_u8; 8];
1933         self.fd
1934             .lock()
1935             .unwrap()
1936             .get_one_reg(id, &mut bytes)
1937             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
1938         Ok(u64::from_le_bytes(bytes))
1939     }
1940 
1941     ///
1942     /// Configure core registers for a given CPU.
1943     ///
1944     #[cfg(target_arch = "aarch64")]
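    /// A hedged sketch (`ignore`); `kernel_entry` and `fdt_start` are
    /// hypothetical guest addresses:
    ///
    /// ```ignore
    /// // vCPU0 boots at the kernel entry point with x0 pointing to the FDT;
    /// // secondary vCPUs only get PSTATE set and await a PSCI wake-up.
    /// vcpu.setup_regs(0, kernel_entry, fdt_start).unwrap();
    /// ```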
1945     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1946         #[allow(non_upper_case_globals)]
1947         // PSR (Processor State Register) bits.
1948         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1949         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1950         const PSR_F_BIT: u64 = 0x0000_0040;
1951         const PSR_I_BIT: u64 = 0x0000_0080;
1952         const PSR_A_BIT: u64 = 0x0000_0100;
1953         const PSR_D_BIT: u64 = 0x0000_0200;
1954         // Taken from arch/arm64/kvm/inject_fault.c.
1955         const PSTATE_FAULT_BITS_64: u64 =
1956             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1957 
1958         let kreg_off = offset_of!(kvm_regs, regs);
1959 
1960         // Get the register index of the PSTATE (Processor State) register.
1961         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
1962         self.fd
1963             .lock()
1964             .unwrap()
1965             .set_one_reg(
1966                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1967                 &PSTATE_FAULT_BITS_64.to_le_bytes(),
1968             )
1969             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1970 
1971         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1972         if cpu_id == 0 {
1973             // Set the PC (Program Counter) to the current program address (kernel address).
1974             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
1975             self.fd
1976                 .lock()
1977                 .unwrap()
1978                 .set_one_reg(
1979                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
1980                     &boot_ip.to_le_bytes(),
1981                 )
1982                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1983 
1984             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1985             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1986             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1987             // We choose to place it at the end of DRAM. See `get_fdt_addr`.
1988             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
1989             self.fd
1990                 .lock()
1991                 .unwrap()
1992                 .set_one_reg(
1993                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1994                     &fdt_start.to_le_bytes(),
1995                 )
1996                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1997         }
1998         Ok(())
1999     }
2000 
2001     #[cfg(target_arch = "x86_64")]
2002     ///
2003     /// Get the current CPU state
2004     ///
2005     /// Ordering requirements:
2006     ///
2007     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
2008     /// vCPU/LAPIC state. As such, it must be done before almost everything
2009     /// else; otherwise we cannot restore everything and expect it to work.
2010     ///
2011     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2012     /// still running.
2013     ///
2014     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
2015     ///
2016     /// GET_VCPU_EVENTS should probably be saved last, as it may well be
2017     /// affected by the internal state modifications performed by the other
2018     /// GET ioctls.
2019     ///
2020     /// SREGS saves/restores a pending interrupt, similar to what
2021     /// VCPU_EVENTS also does.
2022     ///
2023     /// GET_MSRS requires a prepopulated data structure to do something
2024     /// meaningful. The same structure then contains valid data for SET_MSRS.
2025     ///
2026     /// # Example
2027     ///
2028     /// ```rust
2029     /// # use hypervisor::kvm::KvmHypervisor;
2030     /// # use std::sync::Arc;
2031     /// let kvm = KvmHypervisor::new().unwrap();
2032     /// let hv = Arc::new(kvm);
2033     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2034     /// vm.enable_split_irq().unwrap();
2035     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2036     /// let state = vcpu.state().unwrap();
2037     /// ```
2038     fn state(&self) -> cpu::Result<CpuState> {
2039         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
2040         let mp_state = self.get_mp_state()?.into();
2041         let regs = self.get_regs()?;
2042         let sregs = self.get_sregs()?;
2043         let xsave = self.get_xsave()?;
2044         let xcrs = self.get_xcrs()?;
2045         let lapic_state = self.get_lapic()?;
2046         let fpu = self.get_fpu()?;
2047 
2048         // Try to get all MSRs based on the list previously retrieved from KVM.
2049         // If the number of MSRs obtained from GET_MSRS is different from the
2050         // expected amount, we fall back to a slower method and fetch the MSRs
2051         // in chunks. This is the only way to make sure we try to get as many
2052         // MSRs as possible, even if some MSRs are not supported.
2053         let mut msr_entries = self.msrs.clone();
2054 
2055         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
2056         // emulated.
2057         if self.hyperv_synic.load(Ordering::Acquire) {
2058             let hyperv_synic_msrs = vec![
2059                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
2060                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
2061                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
2062                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
2063                 0x400000b5, 0x400000b6, 0x400000b7,
2064             ];
2065             for index in hyperv_synic_msrs {
2066                 let msr = kvm_msr_entry {
2067                     index,
2068                     ..Default::default()
2069                 };
2070                 msr_entries.push(msr.into());
2071             }
2072         }
2073 
2074         let expected_num_msrs = msr_entries.len();
2075         let num_msrs = self.get_msrs(&mut msr_entries)?;
2076         let msrs = if num_msrs != expected_num_msrs {
2077             let mut faulty_msr_index = num_msrs;
2078             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
2079 
2080             loop {
2081                 warn!(
2082                     "Detected faulty MSR 0x{:x} while getting MSRs",
2083                     msr_entries[faulty_msr_index].index
2084                 );
2085 
2086                 // Skip the first bad MSR
2087                 let start_pos = faulty_msr_index + 1;
2088 
2089                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
2090                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
2091 
2092                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
2093 
2094                 if num_msrs == sub_msr_entries.len() {
2095                     break;
2096                 }
2097 
2098                 faulty_msr_index = start_pos + num_msrs;
2099             }
2100 
2101             msr_entries_tmp
2102         } else {
2103             msr_entries
2104         };
2105 
2106         let vcpu_events = self.get_vcpu_events()?;
2107         let tsc_khz = self.tsc_khz()?;
2108 
2109         Ok(VcpuKvmState {
2110             cpuid,
2111             msrs,
2112             vcpu_events,
2113             regs: regs.into(),
2114             sregs: sregs.into(),
2115             fpu,
2116             lapic_state,
2117             xsave,
2118             xcrs,
2119             mp_state,
2120             tsc_khz,
2121         }
2122         .into())
2123     }
2124 
2125     ///
2126     /// Get the current AArch64 CPU state
2127     ///
2128     #[cfg(target_arch = "aarch64")]
2129     fn state(&self) -> cpu::Result<CpuState> {
2130         let mut state = VcpuKvmState {
2131             mp_state: self.get_mp_state()?.into(),
2132             ..Default::default()
2133         };
2134         // Get core registers
2135         state.core_regs = self.get_regs()?.into();
2136 
2137         // Get system registers.
2138         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2139         // For Armv8 there are around 500 registers.
2140         let mut sys_regs: Vec<Register> = Vec::new();
2141         let mut reg_list = RegList::new(500).unwrap();
2142         self.fd
2143             .lock()
2144             .unwrap()
2145             .get_reg_list(&mut reg_list)
2146             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2147 
2148         // At this point reg_list should contain the core registers and the
2149         // system registers.
2150         // The register list contains the number of registers and their ids. We
2151         // will need to call KVM_GET_ONE_REG on each id in order to save all of
2152         // them. We carve out of the list the core registers, which are
2153         // represented in the kernel by the kvm_regs structure and for which we
2154         // can calculate the id based on the offset in the structure.
2155         reg_list.retain(|regid| is_system_register(*regid));
2156 
2157         // Now, for the rest of the registers left in the previously fetched
2158         // register list, we simply call KVM_GET_ONE_REG on each of them.
2159         let indices = reg_list.as_slice();
2160         for index in indices.iter() {
2161             let mut bytes = [0_u8; 8];
2162             self.fd
2163                 .lock()
2164                 .unwrap()
2165                 .get_one_reg(*index, &mut bytes)
2166                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2167             sys_regs.push(kvm_bindings::kvm_one_reg {
2168                 id: *index,
2169                 addr: u64::from_le_bytes(bytes),
2170             });
2171         }
2172 
2173         state.sys_regs = sys_regs;
2174 
2175         Ok(state.into())
2176     }
2177 
2178     #[cfg(target_arch = "x86_64")]
2179     ///
2180     /// Restore the previously saved CPU state
2181     ///
2182     /// Ordering requirements:
2183     ///
2184     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2185     /// still running.
2186     ///
2187     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
2188     /// if we ever change the BSP, we have to do that before restoring anything.
2189     /// The same seems to be true for CPUID stuff.
2190     ///
2191     /// SREGS saves/restores a pending interrupt, similar to what
2192     /// VCPU_EVENTS also does.
2193     ///
2194     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
2195     /// done before SET_VCPU_EVENTS, which restores it.
2196     ///
2197     /// SET_LAPIC must come after SET_SREGS, because the latter restores
2198     /// the apic base msr.
2199     ///
2200     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
2201     /// only restores successfully, when the LAPIC is correctly configured.
2202     ///
2203     /// Arguments: CpuState
2204     /// # Example
2205     ///
2206     /// ```rust
2207     /// # use hypervisor::kvm::KvmHypervisor;
2208     /// # use std::sync::Arc;
2209     /// let kvm = KvmHypervisor::new().unwrap();
2210     /// let hv = Arc::new(kvm);
2211     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2212     /// vm.enable_split_irq().unwrap();
2213     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2214     /// let state = vcpu.state().unwrap();
2215     /// vcpu.set_state(&state).unwrap();
2216     /// ```
2217     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2218         let state: VcpuKvmState = state.clone().into();
2219         self.set_cpuid2(&state.cpuid)?;
2220         self.set_mp_state(state.mp_state.into())?;
2221         self.set_regs(&state.regs.into())?;
2222         self.set_sregs(&state.sregs.into())?;
2223         self.set_xsave(&state.xsave)?;
2224         self.set_xcrs(&state.xcrs)?;
2225         self.set_lapic(&state.lapic_state)?;
2226         self.set_fpu(&state.fpu)?;
2227 
2228         if let Some(freq) = state.tsc_khz {
2229             self.set_tsc_khz(freq)?;
2230         }
2231 
2232         // Try to set all MSRs previously stored.
2233         // If the number of MSRs set from SET_MSRS is different from the
2234         // expected amount, we fall back to a slower method and set the MSRs
2235         // in chunks. This is the only way to make sure we try to set as many
2236         // MSRs as possible, even if some MSRs are not supported.
2237         let expected_num_msrs = state.msrs.len();
2238         let num_msrs = self.set_msrs(&state.msrs)?;
2239         if num_msrs != expected_num_msrs {
2240             let mut faulty_msr_index = num_msrs;
2241 
2242             loop {
2243                 warn!(
2244                     "Detected faulty MSR 0x{:x} while setting MSRs",
2245                     state.msrs[faulty_msr_index].index
2246                 );
2247 
2248                 // Skip the first bad MSR
2249                 let start_pos = faulty_msr_index + 1;
2250 
2251                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2252 
2253                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2254 
2255                 if num_msrs == sub_msr_entries.len() {
2256                     break;
2257                 }
2258 
2259                 faulty_msr_index = start_pos + num_msrs;
2260             }
2261         }
2262 
2263         self.set_vcpu_events(&state.vcpu_events)?;
2264 
2265         Ok(())
2266     }
2267 
2268     ///
2269     /// Restore the previously saved AArch64 CPU state
2270     ///
2271     #[cfg(target_arch = "aarch64")]
2272     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2273         let state: VcpuKvmState = state.clone().into();
2274         // Set core registers
2275         self.set_regs(&state.core_regs.into())?;
2276         // Set system registers
2277         for reg in &state.sys_regs {
2278             self.fd
2279                 .lock()
2280                 .unwrap()
2281                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2282                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2283         }
2284 
2285         self.set_mp_state(state.mp_state.into())?;
2286 
2287         Ok(())
2288     }
2289 
2290     ///
2291     /// Initialize TDX for this CPU
2292     ///
2293     #[cfg(feature = "tdx")]
2294     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2295         tdx_command(
2296             &self.fd.lock().unwrap().as_raw_fd(),
2297             TdxCommand::InitVcpu,
2298             0,
2299             hob_address,
2300         )
2301         .map_err(cpu::HypervisorCpuError::InitializeTdx)
2302     }
2303 
2304     ///
2305     /// Set the "immediate_exit" state
2306     ///
2307     fn set_immediate_exit(&self, exit: bool) {
2308         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
2309     }
2310 
2311     ///
2312     /// Returns the details about TDX exit reason
2313     ///
2314     #[cfg(feature = "tdx")]
2315     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2316         let mut fd = self.fd.as_ref().lock().unwrap();
2317         let kvm_run = fd.get_kvm_run();
2318         // SAFETY: accessing a union field in a valid structure
2319         let tdx_vmcall = unsafe {
2320             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2321                 as *mut KvmTdxExit))
2322                 .u
2323                 .vmcall
2324         };
2325 
2326         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2327 
2328         if tdx_vmcall.type_ != 0 {
2329             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2330         }
2331 
2332         match tdx_vmcall.subfunction {
2333             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2334             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2335                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2336             }
2337             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2338         }
2339     }
2340 
2341     ///
2342     /// Set the status code for TDX exit
2343     ///
2344     #[cfg(feature = "tdx")]
2345     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2346         let mut fd = self.fd.as_ref().lock().unwrap();
2347         let kvm_run = fd.get_kvm_run();
2348         // SAFETY: accessing a union field in a valid structure
2349         let tdx_vmcall = unsafe {
2350             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2351                 as *mut KvmTdxExit))
2352                 .u
2353                 .vmcall
2354         };
2355 
2356         tdx_vmcall.status_code = match status {
2357             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2358             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2359         };
2360     }
2361 
2362     #[cfg(target_arch = "x86_64")]
2363     ///
2364     /// Return the list of initial MSR entries for a VCPU
2365     ///
2366     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2367         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2368 
2369         [
2370             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2371             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2372             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2373             msr!(msr_index::MSR_STAR),
2374             msr!(msr_index::MSR_CSTAR),
2375             msr!(msr_index::MSR_LSTAR),
2376             msr!(msr_index::MSR_KERNEL_GS_BASE),
2377             msr!(msr_index::MSR_SYSCALL_MASK),
2378             msr!(msr_index::MSR_IA32_TSC),
2379             msr_data!(
2380                 msr_index::MSR_IA32_MISC_ENABLE,
2381                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2382             ),
2383             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2384         ]
2385         .to_vec()
2386     }
2387 
2388     #[cfg(target_arch = "aarch64")]
2389     fn has_pmu_support(&self) -> bool {
2390         let cpu_attr = kvm_bindings::kvm_device_attr {
2391             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2392             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2393             addr: 0x0,
2394             flags: 0,
2395         };
2396         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
2397     }
2398 
2399     #[cfg(target_arch = "aarch64")]
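    /// A hedged sketch (`ignore`): probe for PMU support before wiring the
    /// PMU overflow interrupt (`pmu_irq` is a hypothetical interrupt number):
    ///
    /// ```ignore
    /// if vcpu.has_pmu_support() {
    ///     vcpu.init_pmu(pmu_irq).unwrap();
    /// }
    /// ```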
2400     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2401         let cpu_attr = kvm_bindings::kvm_device_attr {
2402             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2403             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2404             addr: 0x0,
2405             flags: 0,
2406         };
2407         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2408             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2409             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2410             addr: &irq as *const u32 as u64,
2411             flags: 0,
2412         };
2413         self.fd
2414             .lock()
2415             .unwrap()
2416             .set_device_attr(&cpu_attr_irq)
2417             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2418         self.fd
2419             .lock()
2420             .unwrap()
2421             .set_device_attr(&cpu_attr)
2422             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2423     }
2424 
2425     #[cfg(target_arch = "x86_64")]
2426     ///
2427     /// Get the frequency of the TSC if available
2428     ///
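    /// # Example
    ///
    /// A hedged round-trip sketch following the `state()`/`set_state()` paths
    /// above (`Ok(None)` means the capability is unavailable):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// if let Some(khz) = vcpu.tsc_khz().unwrap() {
    ///     vcpu.set_tsc_khz(khz).unwrap();
    /// }
    /// ```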
2429     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2430         match self.fd.lock().unwrap().get_tsc_khz() {
2431             Err(e) => {
2432                 if e.errno() == libc::EIO {
2433                     Ok(None)
2434                 } else {
2435                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2436                 }
2437             }
2438             Ok(v) => Ok(Some(v)),
2439         }
2440     }
2441 
2442     #[cfg(target_arch = "x86_64")]
2443     ///
2444     /// Set the frequency of the TSC if available
2445     ///
2446     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2447         match self.fd.lock().unwrap().set_tsc_khz(freq) {
2448             Err(e) => {
2449                 if e.errno() == libc::EIO {
2450                     Ok(())
2451                 } else {
2452                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2453                 }
2454             }
2455             Ok(_) => Ok(()),
2456         }
2457     }
2458 
2459     #[cfg(target_arch = "x86_64")]
2460     ///
2461     /// Trigger NMI interrupt
2462     ///
2463     fn nmi(&self) -> cpu::Result<()> {
2464         match self.fd.lock().unwrap().nmi() {
2465             Err(e) => {
2466                 if e.errno() == libc::EIO {
2467                     Ok(())
2468                 } else {
2469                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
2470                 }
2471             }
2472             Ok(_) => Ok(()),
2473         }
2474     }
2475 }
2476 
2477 impl KvmVcpu {
2478     #[cfg(target_arch = "x86_64")]
2479     ///
2480     /// X86 specific call that returns the vcpu's current "xsave struct".
2481     ///
2482     fn get_xsave(&self) -> cpu::Result<XsaveState> {
2483         Ok(self
2484             .fd
2485             .lock()
2486             .unwrap()
2487             .get_xsave()
2488             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
2489             .into())
2490     }
2491 
2492     #[cfg(target_arch = "x86_64")]
2493     ///
2494     /// X86 specific call that sets the vcpu's current "xsave struct".
2495     ///
2496     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
2497         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
2498         self.fd
2499             .lock()
2500             .unwrap()
2501             .set_xsave(&xsave)
2502             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2503     }
2504 
2505     #[cfg(target_arch = "x86_64")]
2506     ///
2507     /// X86 specific call that returns the vcpu's current "xcrs".
2508     ///
2509     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2510         self.fd
2511             .lock()
2512             .unwrap()
2513             .get_xcrs()
2514             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2515     }
2516 
2517     #[cfg(target_arch = "x86_64")]
2518     ///
2519     /// X86 specific call that sets the vcpu's current "xcrs".
2520     ///
2521     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2522         self.fd
2523             .lock()
2524             .unwrap()
2525             .set_xcrs(xcrs)
2526             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2527     }
2528 
2529     #[cfg(target_arch = "x86_64")]
2530     ///
2531     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2532     /// states of the vcpu.
2533     ///
2534     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2535         self.fd
2536             .lock()
2537             .unwrap()
2538             .get_vcpu_events()
2539             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2540     }
2541 
2542     #[cfg(target_arch = "x86_64")]
2543     ///
2544     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2545     /// of the vcpu.
2546     ///
2547     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2548         self.fd
2549             .lock()
2550             .unwrap()
2551             .set_vcpu_events(events)
2552             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2553     }
2554 }
2555