// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset_of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

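/// Conversions between the crate-generic `UserMemoryRegion` and the raw
/// `kvm_userspace_memory_region`. Note the asymmetry: KVM guest memory is
/// always readable, so `USER_MEMORY_REGION_READ` is implied rather than
/// encoded in the KVM flags. A minimal round-trip sketch (field values are
/// illustrative):
///
/// ```ignore
/// let kvm_region = kvm_userspace_memory_region {
///     slot: 0,
///     guest_phys_addr: 0x10_0000,
///     memory_size: 0x1000,
///     userspace_addr: 0x7f00_0000_0000,
///     flags: KVM_MEM_LOG_DIRTY_PAGES,
/// };
/// let generic: UserMemoryRegion = kvm_region.into();
/// assert!(generic.flags & USER_MEMORY_REGION_READ != 0);
/// assert!(generic.flags & USER_MEMORY_REGION_WRITE != 0);
/// assert!(generic.flags & USER_MEMORY_REGION_LOG_DIRTY != 0);
/// let back: kvm_userspace_memory_region = generic.into();
/// assert_eq!(back.flags, KVM_MEM_LOG_DIRTY_PAGES);
/// ```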
impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

///
/// Implementation of the Vm trait for KVM, exposing the VM-scoped operations
/// (memory regions, vCPU creation, interrupt routing, ioeventfds, and so on).
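///
/// A usage sketch (assumes the `kvm` feature is enabled; `set_tss_address`
/// stands in for any of the trait's setters/getters, and its address is
/// illustrative):
///
/// ```ignore
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set_tss_address(0xfffb_d000).unwrap();
/// ```
///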
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address to which it has previously been registered.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, the range of the 'devid' is limited: it
                    // cannot exceed the maximum of a u16 (65535).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This resolves the range-checking problem and gives every
                    // device a distinct `devid`. The limitation is that at most
                    // 256 segments can be supported.
                    //
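                    // For example (hypothetical values): segment 0x0001, bus 0x00,
                    // device 0x02, function 0 give a BDF of 0x0001_0010; folding
                    // the low segment byte into the bus field below yields a
                    // devid of 0x0110.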
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
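    /// A minimal usage sketch (slot, addresses and size are illustrative):
    ///
    /// ```ignore
    /// let region = vm.make_user_memory_region(0, 0x0, 0x2000, host_addr, false, true);
    /// vm.create_user_memory_region(region).unwrap();
    /// ```
    ///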
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
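    /// The bitmap packs 64 pages per `u64` word, so a region of `memory_size`
    /// bytes yields roughly `memory_size / 4096 / 64` words (assuming 4 KiB
    /// pages).
    ///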
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
/// Implementation of the Hypervisor trait for KVM. See the usage sketch below.
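///
/// A usage sketch (assumes the `kvm` feature is enabled):
///
/// ```ignore
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
///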
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as
    /// a Vm trait object. See the usage sketch below.
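    ///
    /// A usage sketch (`0` selects the default platform VM type):
    ///
    /// ```ignore
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    ///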
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry, as
                        // this can't be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object.
    /// See the usage sketch below.
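    ///
    /// A usage sketch:
    ///
    /// ```ignore
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    ///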
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM. See the usage sketch below.
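///
/// A usage sketch (assumes the `kvm` feature is enabled; the get/set pair
/// stands in for any of the trait's getters/setters):
///
/// ```ignore
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// let state = vcpu.get_mp_state().unwrap();
/// vcpu.set_mp_state(state).unwrap();
/// ```
///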
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset_of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 as 32-bit registers).
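        // Each register is read with KVM_GET_ONE_REG, using an ID built by
        // `arm64_core_reg_id!`: it combines KVM_REG_ARM64, the register size
        // (KVM_REG_SIZE_U64 here) and KVM_REG_ARM_CORE with the field offset
        // within `kvm_regs` (scaled as required by the macro's definition).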
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset_of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Second one, the program counter.
        let off = offset_of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Next is the processor state.
        let off = offset_of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // The stack pointer associated with EL1
        let off = offset_of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Exception Link Register for EL1, when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset_of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Floating-point Control Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // Registers are set in exactly the same order as the fields appear in
        // `state`; see `get_regs` above for additional info on each register.
        let mut off = offset_of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset_of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.sp.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pc.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pstate.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.sp_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.elr_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.spsr[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
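    /// A usage sketch (the MSR index is illustrative; entries are filled in
    /// place and the number of successfully read MSRs is returned):
    ///
    /// ```ignore
    /// let mut msrs = vec![MsrEntry { index: 0x174, ..Default::default() }];
    /// let nread = vcpu.get_msrs(&mut msrs).unwrap();
    /// assert_eq!(nread, 1);
    /// ```
    ///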
1446     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1447         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1448         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1449         let succ = self
1450             .fd
1451             .get_msrs(&mut kvm_msrs)
1452             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1453 
1454         msrs[..succ].copy_from_slice(
1455             &kvm_msrs.as_slice()[..succ]
1456                 .iter()
1457                 .map(|e| (*e).into())
1458                 .collect::<Vec<MsrEntry>>(),
1459         );
1460 
1461         Ok(succ)
1462     }
1463     #[cfg(target_arch = "x86_64")]
1464     ///
1465     /// Setup the model-specific registers (MSR) for this vCPU.
1466     /// Returns the number of MSR entries actually written.
1467     ///
1468     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1469         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1470         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1471         self.fd
1472             .set_msrs(&kvm_msrs)
1473             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1474     }
1475     ///
1476     /// Returns the vcpu's current "multiprocessing state".
1477     ///
1478     fn get_mp_state(&self) -> cpu::Result<MpState> {
1479         Ok(self
1480             .fd
1481             .get_mp_state()
1482             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1483             .into())
1484     }
1485     ///
1486     /// Sets the vcpu's current "multiprocessing state".
1487     ///
1488     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1489         self.fd
1490             .set_mp_state(mp_state.into())
1491             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1492     }
1493     #[cfg(target_arch = "x86_64")]
1494     ///
1495     /// Translates a guest virtual address to a guest physical address using the `KVM_TRANSLATE` ioctl.
1496     ///
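    ///
    /// # Example
    ///
    /// A minimal, hedged sketch assuming a KVM-capable x86_64 host. The GVA
    /// 0x1000 is arbitrary; on a freshly created vCPU paging is off, so the
    /// translation is typically just the identity mapping:
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// match vcpu.translate_gva(0x1000, 0) {
    ///     Ok((gpa, _)) => println!("GVA 0x1000 -> GPA {:#x}", gpa),
    ///     Err(e) => println!("translation failed: {}", e),
    /// }
    /// ```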
1497     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1498         let tr = self
1499             .fd
1500             .translate_gva(gva)
1501             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1502         // tr.valid is set if the GVA is mapped to a valid GPA.
1503         match tr.valid {
1504             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1505                 "Invalid GVA: {:#x}",
1506                 gva
1507             ))),
1508             _ => Ok((tr.physical_address, 0)),
1509         }
1510     }
1511     ///
1512     /// Runs the current virtual CPU, returning an exit reason.
1513     ///
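    ///
    /// A hedged sketch of a caller-side run loop (the real loop lives in the
    /// vmm crate and must handle every `VmExit` variant):
    ///
    /// ```text
    /// loop {
    ///     match vcpu.run() {
    ///         Ok(cpu::VmExit::Ignore) => continue, // I/O handled via VmOps
    ///         Ok(cpu::VmExit::Reset) => break,     // guest requested a reset
    ///         Ok(exit) => todo!("handle {:?}", exit),
    ///         Err(e) => panic!("vCPU error: {:?}", e),
    ///     }
    /// }
    /// ```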
1514     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1515         match self.fd.run() {
1516             Ok(run) => match run {
1517                 #[cfg(target_arch = "x86_64")]
1518                 VcpuExit::IoIn(addr, data) => {
1519                     if let Some(vm_ops) = &self.vm_ops {
1520                         return vm_ops
1521                             .pio_read(addr.into(), data)
1522                             .map(|_| cpu::VmExit::Ignore)
1523                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1524                     }
1525 
1526                     Ok(cpu::VmExit::IoIn(addr, data))
1527                 }
1528                 #[cfg(target_arch = "x86_64")]
1529                 VcpuExit::IoOut(addr, data) => {
1530                     if let Some(vm_ops) = &self.vm_ops {
1531                         return vm_ops
1532                             .pio_write(addr.into(), data)
1533                             .map(|_| cpu::VmExit::Ignore)
1534                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1535                     }
1536 
1537                     Ok(cpu::VmExit::IoOut(addr, data))
1538                 }
1539                 #[cfg(target_arch = "x86_64")]
1540                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1541                 #[cfg(target_arch = "x86_64")]
1542                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1543 
1544                 #[cfg(target_arch = "aarch64")]
1545                 VcpuExit::SystemEvent(event_type, flags) => {
1546                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1547                     // On AArch64, when the VM is shut down, run() returns
1548                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1549                     if event_type == KVM_SYSTEM_EVENT_RESET {
1550                         Ok(cpu::VmExit::Reset)
1551                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1552                         Ok(cpu::VmExit::Shutdown)
1553                     } else {
1554                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1555                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1556                             event_type,
1557                             flags
1558                         )))
1559                     }
1560                 }
1561 
1562                 VcpuExit::MmioRead(addr, data) => {
1563                     if let Some(vm_ops) = &self.vm_ops {
1564                         return vm_ops
1565                             .mmio_read(addr, data)
1566                             .map(|_| cpu::VmExit::Ignore)
1567                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1568                     }
1569 
1570                     Ok(cpu::VmExit::MmioRead(addr, data))
1571                 }
1572                 VcpuExit::MmioWrite(addr, data) => {
1573                     if let Some(vm_ops) = &self.vm_ops {
1574                         return vm_ops
1575                             .mmio_write(addr, data)
1576                             .map(|_| cpu::VmExit::Ignore)
1577                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1578                     }
1579 
1580                     Ok(cpu::VmExit::MmioWrite(addr, data))
1581                 }
1582                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1583                 #[cfg(feature = "tdx")]
1584                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1585                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1586 
1587                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1588                     "Unexpected exit reason on vcpu run: {:?}",
1589                     r
1590                 ))),
1591             },
1592 
1593             Err(ref e) => match e.errno() {
1594                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1595                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1596                     "VCPU error {:?}",
1597                     e
1598                 ))),
1599             },
1600         }
1601     }
1602     #[cfg(target_arch = "x86_64")]
1603     ///
1604     /// Let the guest know that it has been paused, which prevents potential
1605     /// soft lockups when it is resumed.
1606     ///
1607     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1608         if let Err(e) = self.fd.kvmclock_ctrl() {
1609         // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
1610         // which could be because we're still in firmware or the guest doesn't
1611         // use the KVM clock.
1612             if e.errno() != libc::EINVAL {
1613                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1614             }
1615         }
1616 
1617         Ok(())
1618     }
1619     ///
1620     /// Sets debug registers to install hardware breakpoints and/or enable single-stepping.
1621     ///
1622     fn set_guest_debug(
1623         &self,
1624         addrs: &[vm_memory::GuestAddress],
1625         singlestep: bool,
1626     ) -> cpu::Result<()> {
1627         let mut dbg = kvm_guest_debug {
1628             #[cfg(target_arch = "x86_64")]
1629             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1630             #[cfg(target_arch = "aarch64")]
1631             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1632             ..Default::default()
1633         };
1634         if singlestep {
1635             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1636         }
1637 
1638         // Set the debug registers.
1639         // Here we assume that the number of addresses does not exceed what
1640         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1641         #[cfg(target_arch = "x86_64")]
1642         {
1643             // Set bits 9 and 10.
1644             // bit 9: GE (global exact breakpoint enable) flag.
1645             // bit 10: always 1.
1646             dbg.arch.debugreg[7] = 0x0600;
1647 
1648             for (i, addr) in addrs.iter().enumerate() {
1649                 dbg.arch.debugreg[i] = addr.0;
1650                 // Set global breakpoint enable flag
1651                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1652             }
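            // As a worked illustration: with two breakpoint addresses the
            // loop above yields DR7 = 0x0600 | (2 << 0) | (2 << 2) = 0x060a,
            // i.e. GE plus the G0 and G1 global-enable bits, while DR0 and
            // DR1 hold the breakpoint addresses themselves.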
1653         }
1654         #[cfg(target_arch = "aarch64")]
1655         {
1656             for (i, addr) in addrs.iter().enumerate() {
1657                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1658                 // bit 0: 1 (Enabled)
1659                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1660                 // bit 5~8: 0b1111 (BAS = AArch64)
1661                 // others: 0
1662                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1663                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1664                 // bit 2~52: VA[2:52]
1665                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1666             }
1667         }
1668         self.fd
1669             .set_guest_debug(&dbg)
1670             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1671     }
1672     #[cfg(target_arch = "aarch64")]
1673     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1674         self.fd
1675             .vcpu_init(kvi)
1676             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1677     }
1678     ///
1679     /// Gets a list of the guest registers that are supported for the
1680     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1681     ///
1682     #[cfg(target_arch = "aarch64")]
1683     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1684         self.fd
1685             .get_reg_list(reg_list)
1686             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1687     }
1688     ///
1689     /// Gets the value of a system register
1690     ///
1691     #[cfg(target_arch = "aarch64")]
1692     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1693         //
1694         // The Arm Architecture Reference Manual defines the encoding of
1695         // AArch64 system registers, see
1696         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1697         // KVM defines its own ID for each AArch64 system register, which
1698         // is used when calling `KVM_G/SET_ONE_REG` to access a system
1699         // register of a guest.
1700         // A mapping exists between the Arm standard encoding and the KVM ID.
1701         // This function takes the standard u32 ID as an input parameter,
1702         // converts it to the corresponding KVM ID, and calls the
1703         // `KVM_GET_ONE_REG` API to get the value of the system register.
1704         //
1705         let id: u64 = KVM_REG_ARM64
1706             | KVM_REG_SIZE_U64
1707             | KVM_REG_ARM64_SYSREG as u64
1708             | ((((sys_reg) >> 5)
1709                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1710                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1711                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1712                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1713                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1714         Ok(self
1715             .fd
1716             .get_one_reg(id)
1717             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1718             .try_into()
1719             .unwrap())
1720     }
1721     ///
1722     /// Configure core registers for a given CPU.
1723     ///
1724     #[cfg(target_arch = "aarch64")]
1725     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1726         #[allow(non_upper_case_globals)]
1727         // PSR (Processor State Register) bits.
1728         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1729         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1730         const PSR_F_BIT: u64 = 0x0000_0040;
1731         const PSR_I_BIT: u64 = 0x0000_0080;
1732         const PSR_A_BIT: u64 = 0x0000_0100;
1733         const PSR_D_BIT: u64 = 0x0000_0200;
1734         // Taken from arch/arm64/kvm/inject_fault.c.
1735         const PSTATE_FAULT_BITS_64: u64 =
1736             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
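        // The OR above evaluates to 0x3c5: EL1 with SP_EL1 selected (EL1h)
        // and the D, A, I and F exception masks all set.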
1737 
1738         let kreg_off = offset_of!(kvm_regs, regs);
1739 
1740         // Get the register index of the PSTATE (Processor State) register.
1741         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
1742         self.fd
1743             .set_one_reg(
1744                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1745                 PSTATE_FAULT_BITS_64.into(),
1746             )
1747             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1748 
1749         // Other vCPUs are initially powered off, awaiting a PSCI wakeup.
1750         if cpu_id == 0 {
1751             // Setting the PC (Program Counter) to the current program address (kernel address).
1752             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
1753             self.fd
1754                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into())
1755                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1756 
1757             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1758             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1759             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1760             // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
1761             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
1762             self.fd
1763                 .set_one_reg(
1764                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1765                     fdt_start.into(),
1766                 )
1767                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1768         }
1769         Ok(())
1770     }
1771 
1772     #[cfg(target_arch = "x86_64")]
1773     ///
1774     /// Get the current CPU state
1775     ///
1776     /// Ordering requirements:
1777     ///
1778     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
1779     /// vCPU/LAPIC state. As such, it must be done before almost everything
1780     /// else; otherwise we cannot restore everything and expect it to work.
1781     ///
1782     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1783     /// still running.
1784     ///
1785     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1786     ///
1787     /// GET_VCPU_EVENTS should probably be the last state to save, as it
1788     /// may well be affected by internal state modifications of the other
1789     /// GET ioctls.
1790     ///
1791     /// SREGS saves/restores a pending interrupt, similar to what
1792     /// VCPU_EVENTS also does.
1793     ///
1794     /// GET_MSRS requires a pre-populated data structure to do something
1795     /// meaningful. For SET_MSRS it will then contain good data.
1796     ///
1797     /// # Example
1798     ///
1799     /// ```rust
1800     /// # extern crate hypervisor;
1801     /// # use hypervisor::KvmHypervisor;
1802     /// # use std::sync::Arc;
1803     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1804     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1805     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1806     /// vm.enable_split_irq().unwrap();
1807     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1808     /// let state = vcpu.state().unwrap();
1809     /// ```
1810     fn state(&self) -> cpu::Result<CpuState> {
1811         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1812         let mp_state = self.get_mp_state()?.into();
1813         let regs = self.get_regs()?;
1814         let sregs = self.get_sregs()?;
1815         let xsave = self.get_xsave()?;
1816         let xcrs = self.get_xcrs()?;
1817         let lapic_state = self.get_lapic()?;
1818         let fpu = self.get_fpu()?;
1819 
1820         // Try to get all MSRs based on the list previously retrieved from KVM.
1821         // If the number of MSRs obtained from GET_MSRS is different from the
1822         // expected amount, we fall back on a slower method, getting the MSRs
1823         // in chunks. This is the only way to make sure we try to get as many
1824         // MSRs as possible, even if some MSRs are not supported.
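        // As a hedged illustration: with 10 expected MSRs where only entry 3
        // is unsupported, GET_MSRS first returns 3; we keep entries 0..3,
        // skip the faulty one, retry from entry 4, and end up with 9 MSRs.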
1825         let mut msr_entries = self.msrs.clone();
1826 
1827         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1828         // emulated.
1829         if self.hyperv_synic.load(Ordering::Acquire) {
1830             let hyperv_synic_msrs = vec![
1831                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1832                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1833                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1834                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1835                 0x400000b5, 0x400000b6, 0x400000b7,
1836             ];
1837             for index in hyperv_synic_msrs {
1838                 let msr = kvm_msr_entry {
1839                     index,
1840                     ..Default::default()
1841                 };
1842                 msr_entries.push(msr.into());
1843             }
1844         }
1845 
1846         let expected_num_msrs = msr_entries.len();
1847         let num_msrs = self.get_msrs(&mut msr_entries)?;
1848         let msrs = if num_msrs != expected_num_msrs {
1849             let mut faulty_msr_index = num_msrs;
1850             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
1851 
1852             loop {
1853                 warn!(
1854                     "Detected faulty MSR 0x{:x} while getting MSRs",
1855                     msr_entries[faulty_msr_index].index
1856                 );
1857 
1858                 // Skip the first bad MSR
1859                 let start_pos = faulty_msr_index + 1;
1860 
1861                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
1862                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1863 
1864                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
1865 
1866                 if num_msrs == sub_msr_entries.len() {
1867                     break;
1868                 }
1869 
1870                 faulty_msr_index = start_pos + num_msrs;
1871             }
1872 
1873             msr_entries_tmp
1874         } else {
1875             msr_entries
1876         };
1877 
1878         let vcpu_events = self.get_vcpu_events()?;
1879 
1880         Ok(VcpuKvmState {
1881             cpuid,
1882             msrs,
1883             vcpu_events,
1884             regs: regs.into(),
1885             sregs: sregs.into(),
1886             fpu,
1887             lapic_state,
1888             xsave,
1889             xcrs,
1890             mp_state,
1891         }
1892         .into())
1893     }
1894     ///
1895     /// Get the current AArch64 CPU state
1896     ///
1897     #[cfg(target_arch = "aarch64")]
1898     fn state(&self) -> cpu::Result<CpuState> {
1899         let mut state = VcpuKvmState {
1900             mp_state: self.get_mp_state()?.into(),
1901             ..Default::default()
1902         };
1903         // Get core registers
1904         state.core_regs = self.get_regs()?;
1905 
1906         // Get system registers.
1907         // Call KVM_GET_REG_LIST to get all registers available to the guest.
1908         // For Armv8 there are around 500 registers.
1909         let mut sys_regs: Vec<Register> = Vec::new();
1910         let mut reg_list = RegList::new(500).unwrap();
1911         self.fd
1912             .get_reg_list(&mut reg_list)
1913             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1914 
1915         // At this point reg_list should contain: core registers and system
1916         // registers.
1917         // The register list contains the number of registers and their ids.
1918         // We will need to call KVM_GET_ONE_REG on each id in order to save
1919         // all of them. We carve out from the list the core registers, which
1920         // are represented in the kernel by the kvm_regs structure and for
1921         // which we can calculate the id based on the offset in the structure.
1922         reg_list.retain(|regid| is_system_register(*regid));
1923 
1924         // Now, for the rest of the registers left in the previously fetched
1925         // register list, we are simply calling KVM_GET_ONE_REG.
1926         let indices = reg_list.as_slice();
1927         for index in indices.iter() {
1928             sys_regs.push(kvm_bindings::kvm_one_reg {
1929                 id: *index,
1930                 addr: self
1931                     .fd
1932                     .get_one_reg(*index)
1933                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1934                     .try_into()
1935                     .unwrap(),
1936             });
1937         }
1938 
1939         state.sys_regs = sys_regs;
1940 
1941         Ok(state.into())
1942     }
1943     #[cfg(target_arch = "x86_64")]
1944     ///
1945     /// Restore the previously saved CPU state
1946     ///
1947     /// Ordering requirements:
1948     ///
1949     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1950     /// still running.
1951     ///
1952     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1953     /// if we ever change the BSP, we have to do that before restoring anything.
1954     /// The same seems to be true for CPUID stuff.
1955     ///
1956     /// SREGS saves/restores a pending interrupt, similar to what
1957     /// VCPU_EVENTS also does.
1958     ///
1959     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1960     /// done before SET_VCPU_EVENTS, which restores it.
1961     ///
1962     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1963     /// the apic base msr.
1964     ///
1965     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1966     /// is only restored successfully when the LAPIC is correctly configured.
1967     ///
1968     /// Arguments: CpuState
1969     /// # Example
1970     ///
1971     /// ```rust
1972     /// # extern crate hypervisor;
1973     /// # use hypervisor::KvmHypervisor;
1974     /// # use std::sync::Arc;
1975     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1976     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1977     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1978     /// vm.enable_split_irq().unwrap();
1979     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1980     /// let state = vcpu.state().unwrap();
1981     /// vcpu.set_state(&state).unwrap();
1982     /// ```
1983     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1984         let state: VcpuKvmState = state.clone().into();
1985         self.set_cpuid2(&state.cpuid)?;
1986         self.set_mp_state(state.mp_state.into())?;
1987         self.set_regs(&state.regs.into())?;
1988         self.set_sregs(&state.sregs.into())?;
1989         self.set_xsave(&state.xsave)?;
1990         self.set_xcrs(&state.xcrs)?;
1991         self.set_lapic(&state.lapic_state)?;
1992         self.set_fpu(&state.fpu)?;
1993 
1994         // Try to set all MSRs previously stored.
1995         // If the number of MSRs set from SET_MSRS is different from the
1996         // expected amount, we fall back on a slower method, setting the MSRs
1997         // in chunks. This is the only way to make sure we try to set as many
1998         // MSRs as possible, even if some MSRs are not supported.
1999         let expected_num_msrs = state.msrs.len();
2000         let num_msrs = self.set_msrs(&state.msrs)?;
2001         if num_msrs != expected_num_msrs {
2002             let mut faulty_msr_index = num_msrs;
2003 
2004             loop {
2005                 warn!(
2006                     "Detected faulty MSR 0x{:x} while setting MSRs",
2007                     state.msrs[faulty_msr_index].index
2008                 );
2009 
2010                 // Skip the first bad MSR
2011                 let start_pos = faulty_msr_index + 1;
2012 
2013                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2014 
2015                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2016 
2017                 if num_msrs == sub_msr_entries.len() {
2018                     break;
2019                 }
2020 
2021                 faulty_msr_index = start_pos + num_msrs;
2022             }
2023         }
2024 
2025         self.set_vcpu_events(&state.vcpu_events)?;
2026 
2027         Ok(())
2028     }
2029     ///
2030     /// Restore the previously saved AArch64 CPU state
2031     ///
2032     #[cfg(target_arch = "aarch64")]
2033     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2034         let state: VcpuKvmState = state.clone().into();
2035         // Set core registers
2036         self.set_regs(&state.core_regs)?;
2037         // Set system registers
2038         for reg in &state.sys_regs {
2039             self.fd
2040                 .set_one_reg(reg.id, reg.addr.into())
2041                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2042         }
2043 
2044         self.set_mp_state(state.mp_state.into())?;
2045 
2046         Ok(())
2047     }
2048 
2049     ///
2050     /// Initialize TDX for this CPU
2051     ///
2052     #[cfg(feature = "tdx")]
2053     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2054         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
2055             .map_err(cpu::HypervisorCpuError::InitializeTdx)
2056     }
2057 
2058     ///
2059     /// Set the "immediate_exit" state
2060     ///
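    /// Typically set from a signal handler: once the signal has interrupted
    /// `KVM_RUN`, the next `run()` call returns to userspace immediately with
    /// `EINTR` instead of re-entering the guest.
    ///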
2061     fn set_immediate_exit(&self, exit: bool) {
2062         self.fd.set_kvm_immediate_exit(exit.into());
2063     }
2064 
2065     ///
2066     /// Returns the details about TDX exit reason
2067     ///
2068     #[cfg(feature = "tdx")]
2069     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2070         let kvm_run = self.fd.get_kvm_run();
2071         // SAFETY: accessing a union field in a valid structure
2072         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2073 
2074         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2075 
2076         if tdx_vmcall.type_ != 0 {
2077             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2078         }
2079 
2080         match tdx_vmcall.subfunction {
2081             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2082             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2083                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2084             }
2085             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2086         }
2087     }
2088 
2089     ///
2090     /// Set the status code for TDX exit
2091     ///
2092     #[cfg(feature = "tdx")]
2093     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2094         let kvm_run = self.fd.get_kvm_run();
2095         // SAFETY: accessing a union field in a valid structure
2096         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2097 
2098         tdx_vmcall.status_code = match status {
2099             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2100             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2101         };
2102     }
2103     #[cfg(target_arch = "x86_64")]
2104     ///
2105     /// Return the list of initial MSR entries for a VCPU
2106     ///
2107     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2108         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2109 
2110         [
2111             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2112             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2113             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2114             msr!(msr_index::MSR_STAR),
2115             msr!(msr_index::MSR_CSTAR),
2116             msr!(msr_index::MSR_LSTAR),
2117             msr!(msr_index::MSR_KERNEL_GS_BASE),
2118             msr!(msr_index::MSR_SYSCALL_MASK),
2119             msr!(msr_index::MSR_IA32_TSC),
2120             msr_data!(
2121                 msr_index::MSR_IA32_MISC_ENABLE,
2122                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2123             ),
2124             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2125         ]
2126         .to_vec()
2127     }
2128     #[cfg(target_arch = "aarch64")]
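    ///
    /// Check whether PMUv3 is supported for vCPUs, by probing the
    /// KVM_ARM_VCPU_PMU_V3_INIT device attribute.
    ///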
2129     fn has_pmu_support(&self) -> bool {
2130         let cpu_attr = kvm_bindings::kvm_device_attr {
2131             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2132             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2133             addr: 0x0,
2134             flags: 0,
2135         };
2136         self.fd.has_device_attr(&cpu_attr).is_ok()
2137     }
2138     #[cfg(target_arch = "aarch64")]
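    ///
    /// Initialize PMUv3 for this vCPU: set the PMU overflow interrupt first,
    /// then request PMU initialization via the vCPU device attributes.
    ///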
2139     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2140         let cpu_attr = kvm_bindings::kvm_device_attr {
2141             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2142             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2143             addr: 0x0,
2144             flags: 0,
2145         };
2146         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2147             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2148             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2149             addr: &irq as *const u32 as u64,
2150             flags: 0,
2151         };
2152         self.fd
2153             .set_device_attr(&cpu_attr_irq)
2154             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2155         self.fd
2156             .set_device_attr(&cpu_attr)
2157             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2158     }
2159 
2160     #[cfg(target_arch = "x86_64")]
2161     ///
2162     /// Get the frequency of the TSC if available
2163     ///
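    ///
    /// # Example
    ///
    /// A minimal, hedged sketch assuming a KVM-capable x86_64 host; `None`
    /// means the host kernel cannot report the frequency:
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// if let Some(khz) = vcpu.tsc_khz().unwrap() {
    ///     println!("TSC frequency: {} kHz", khz);
    /// }
    /// ```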
2164     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2165         match self.fd.get_tsc_khz() {
2166             Err(e) => {
2167                 if e.errno() == libc::EIO {
2168                     Ok(None)
2169                 } else {
2170                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2171                 }
2172             }
2173             Ok(v) => Ok(Some(v)),
2174         }
2175     }
2176 }
2177 
2178 impl KvmVcpu {
2179     #[cfg(target_arch = "x86_64")]
2180     ///
2181     /// X86 specific call that returns the vcpu's current "xsave struct".
2182     ///
2183     fn get_xsave(&self) -> cpu::Result<Xsave> {
2184         self.fd
2185             .get_xsave()
2186             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2187     }
2188     #[cfg(target_arch = "x86_64")]
2189     ///
2190     /// X86 specific call that sets the vcpu's current "xsave struct".
2191     ///
2192     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2193         self.fd
2194             .set_xsave(xsave)
2195             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2196     }
2197     #[cfg(target_arch = "x86_64")]
2198     ///
2199     /// X86 specific call that returns the vcpu's current "xcrs".
2200     ///
2201     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2202         self.fd
2203             .get_xcrs()
2204             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2205     }
2206     #[cfg(target_arch = "x86_64")]
2207     ///
2208     /// X86 specific call that sets the vcpu's current "xcrs".
2209     ///
2210     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2211         self.fd
2212             .set_xcrs(xcrs)
2213             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2214     }
2215     #[cfg(target_arch = "x86_64")]
2216     ///
2217     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2218     /// states of the vcpu.
2219     ///
2220     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2221         self.fd
2222             .get_vcpu_events()
2223             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2224     }
2225     #[cfg(target_arch = "x86_64")]
2226     ///
2227     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2228     /// of the vcpu.
2229     ///
2230     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2231         self.fd
2232             .set_vcpu_events(events)
2233             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2234     }
2235 }
2236