xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 5e52729453cb62edbe4fb3a4aa24f8cca31e667e)
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}
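
// Illustrative sketch (not part of the upstream file): `TdxCapabilities` is
// filled in by `KvmHypervisor::tdx_capabilities()` below; a caller could then
// inspect, for instance, how many CPUID leaves are configurable:
//
//     let caps = hypervisor.tdx_capabilities()?;
//     println!("{} configurable CPUID leaves", caps.nr_cpuid_configs);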

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
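
// Illustrative sketch (not part of the upstream file): the two conversions
// above are inverses for the flag bits they model. A read-only KVM region
// loses the WRITE permission on the way in and regains `KVM_MEM_READONLY`
// on the way out:
//
//     let kvm_region = kvm_userspace_memory_region {
//         flags: KVM_MEM_READONLY,
//         ..Default::default()
//     };
//     let region = UserMemoryRegion::from(kvm_region);
//     assert_eq!(region.flags & USER_MEMORY_REGION_WRITE, 0);
//     let back: kvm_userspace_memory_region = region.into();
//     assert_eq!(back.flags, KVM_MEM_READONLY);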

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// // use the vm.get/set family of methods
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This resolves the range-check problem and still gives a
                    // distinct `devid` to every device. The limitation is that
                    // at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
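                    // Worked example (illustrative, not in the upstream file):
                    // segment 0x0001, bus 0x00, device 0x03, function 0x0 gives
                    // cfg.devid = 0x0001_0018; the expression above yields
                    // (0x0001_0000 >> 8) | 0x18 = 0x0118, which fits in a u16.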

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
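    // Illustrative sketch (not part of the upstream file): routing entries
    // are typically built with `make_routing_entry` above and then installed
    // in one shot:
    //
    //     let entry = vm.make_routing_entry(gsi, &config);
    //     vm.set_gsi_routing(&[entry])?;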
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
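    // Illustrative sketch (not part of the upstream file): mapping guest RAM
    // with dirty-page logging enabled, assuming a hypothetical `host_addr`
    // backing allocation:
    //
    //     let region = vm.make_user_memory_region(
    //         0,          // slot
    //         0x10_0000,  // guest_phys_addr
    //         0x20_0000,  // memory_size
    //         host_addr,  // userspace_addr
    //         false,      // readonly
    //         true,       // log_dirty_pages
    //     );
    //     vm.create_user_memory_region(region)?;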
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }
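    // Illustrative sketch (not part of the upstream file): each u64 word of
    // the returned bitmap covers 64 guest pages; page `i` of the slot is
    // dirty when bit (i % 64) of word (i / 64) is set:
    //
    //     let bitmap = vm.get_dirty_log(slot, base_gpa, memory_size)?;
    //     let dirty_pages: u32 = bitmap.iter().map(|w| w.count_ones()).sum();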

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
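// Illustrative sketch (not part of the upstream file): probing for /dev/kvm
// before constructing the hypervisor object:
//
//     if KvmHypervisor::is_available()? {
//         let hypervisor = KvmHypervisor::new()?;
//     }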
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error is EINTR, the ioctl was interrupted
                        // and simply needs to be retried; this can't be
                        // considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }
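    // Note (illustrative, not part of the upstream file): on AArch64 the
    // machine type passed to KVM_CREATE_VM encodes the requested IPA size,
    // so a host reporting e.g. a 40-bit IPA limit yields vm_type = 40 above.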

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
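            // x86 exposes four hardware debug registers (DR0-DR3), hence a
            // fixed count of 4 hardware breakpoints.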
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// // use the vcpu.get/set family of methods
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the Armv8-A architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating-point registers, which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function sets the registers in exactly the same order as
        // `get_regs` reads them; see that function for details on each register.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.sp.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pc.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pstate.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.sp_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.elr_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.spsr[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
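    // Illustrative sketch (not part of the upstream file): reading a single
    // MSR, e.g. IA32_TSC (index 0x10):
    //
    //     let mut msrs = vec![MsrEntry { index: 0x10, ..Default::default() }];
    //     let read = vcpu.get_msrs(&mut msrs)?;
    //     assert_eq!(read, 1);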
1461     #[cfg(target_arch = "x86_64")]
1462     ///
1463     /// Setup the model-specific registers (MSR) for this vCPU.
1464     /// Returns the number of MSR entries actually written.
1465     ///
1466     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1467         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1468         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1469         self.fd
1470             .set_msrs(&kvm_msrs)
1471             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1472     }
1473     ///
1474     /// Returns the vcpu's current "multiprocessing state".
1475     ///
1476     fn get_mp_state(&self) -> cpu::Result<MpState> {
1477         Ok(self
1478             .fd
1479             .get_mp_state()
1480             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1481             .into())
1482     }
1483     ///
1484     /// Sets the vcpu's current "multiprocessing state".
1485     ///
1486     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1487         self.fd
1488             .set_mp_state(mp_state.into())
1489             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1490     }
1491     #[cfg(target_arch = "x86_64")]
1492     ///
1493     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1494     ///
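    /// # Example
    ///
    /// A sketch; an unmapped GVA yields an error instead of a translation,
    /// and the address below is purely illustrative:
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// if let Ok((gpa, _)) = vcpu.translate_gva(0x1000, 0) {
    ///     println!("GVA 0x1000 maps to GPA {:#x}", gpa);
    /// }
    /// ```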
1495     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1496         let tr = self
1497             .fd
1498             .translate_gva(gva)
1499             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1500         // tr.valid is set if the GVA is mapped to a valid GPA.
1501         match tr.valid {
1502             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1503                 "Invalid GVA: {:#x}",
1504                 gva
1505             ))),
1506             _ => Ok((tr.physical_address, 0)),
1507         }
1508     }
1509     ///
1510     /// Triggers the running of the current virtual CPU, returning an exit reason.
1511     ///
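    /// # Example
    ///
    /// A dispatch-loop sketch; marked `no_run` because a real guest also
    /// needs memory and boot code set up first:
    ///
    /// ```no_run
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// use hypervisor::cpu::VmExit;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// loop {
    ///     match vcpu.run().unwrap() {
    ///         VmExit::Ignore => continue, // EAGAIN/EINTR, or handled via VmOps
    ///         _exit => break,             // hand anything else back to the VMM
    ///     }
    /// }
    /// ```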
1512     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1513         match self.fd.run() {
1514             Ok(run) => match run {
1515                 #[cfg(target_arch = "x86_64")]
1516                 VcpuExit::IoIn(addr, data) => {
1517                     if let Some(vm_ops) = &self.vm_ops {
1518                         return vm_ops
1519                             .pio_read(addr.into(), data)
1520                             .map(|_| cpu::VmExit::Ignore)
1521                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1522                     }
1523 
1524                     Ok(cpu::VmExit::IoIn(addr, data))
1525                 }
1526                 #[cfg(target_arch = "x86_64")]
1527                 VcpuExit::IoOut(addr, data) => {
1528                     if let Some(vm_ops) = &self.vm_ops {
1529                         return vm_ops
1530                             .pio_write(addr.into(), data)
1531                             .map(|_| cpu::VmExit::Ignore)
1532                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1533                     }
1534 
1535                     Ok(cpu::VmExit::IoOut(addr, data))
1536                 }
1537                 #[cfg(target_arch = "x86_64")]
1538                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1539                 #[cfg(target_arch = "x86_64")]
1540                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1541 
1542                 #[cfg(target_arch = "aarch64")]
1543                 VcpuExit::SystemEvent(event_type, flags) => {
1544                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1545                     // On AArch64, when the VM is shut down, run() returns
1546                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1547                     if event_type == KVM_SYSTEM_EVENT_RESET {
1548                         Ok(cpu::VmExit::Reset)
1549                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1550                         Ok(cpu::VmExit::Shutdown)
1551                     } else {
1552                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1553                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1554                             event_type,
1555                             flags
1556                         )))
1557                     }
1558                 }
1559 
1560                 VcpuExit::MmioRead(addr, data) => {
1561                     if let Some(vm_ops) = &self.vm_ops {
1562                         return vm_ops
1563                             .mmio_read(addr, data)
1564                             .map(|_| cpu::VmExit::Ignore)
1565                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1566                     }
1567 
1568                     Ok(cpu::VmExit::MmioRead(addr, data))
1569                 }
1570                 VcpuExit::MmioWrite(addr, data) => {
1571                     if let Some(vm_ops) = &self.vm_ops {
1572                         return vm_ops
1573                             .mmio_write(addr, data)
1574                             .map(|_| cpu::VmExit::Ignore)
1575                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1576                     }
1577 
1578                     Ok(cpu::VmExit::MmioWrite(addr, data))
1579                 }
1580                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1581                 #[cfg(feature = "tdx")]
1582                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1583                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1584 
1585                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1586                     "Unexpected exit reason on vcpu run: {:?}",
1587                     r
1588                 ))),
1589             },
1590 
1591             Err(ref e) => match e.errno() {
1592                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1593                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1594                     "VCPU error {:?}",
1595                     e
1596                 ))),
1597             },
1598         }
1599     }
1600     #[cfg(target_arch = "x86_64")]
1601     ///
1602     /// Let the guest know that it has been paused, which prevents potential
1603     /// soft lockups when it is resumed.
1604     ///
1605     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1606         if let Err(e) = self.fd.kvmclock_ctrl() {
1607             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1608             // which could be because we're still in firmware or the guest doesn't
1609             // use KVM clock.
1610             if e.errno() != libc::EINVAL {
1611                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1612             }
1613         }
1614 
1615         Ok(())
1616     }
1617     ///
1618     /// Sets the debug registers to install hardware breakpoints and/or enable single-stepping.
1619     ///
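    /// # Example
    ///
    /// A sketch: install one hardware breakpoint and enable single-stepping
    /// (the guest address is illustrative):
    ///
    /// ```no_run
    /// # extern crate hypervisor;
    /// # extern crate vm_memory;
    /// # use std::sync::Arc;
    /// # use vm_memory::GuestAddress;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_guest_debug(&[GuestAddress(0x10_0000)], true).unwrap();
    /// ```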
1620     fn set_guest_debug(
1621         &self,
1622         addrs: &[vm_memory::GuestAddress],
1623         singlestep: bool,
1624     ) -> cpu::Result<()> {
1625         let mut dbg = kvm_guest_debug {
1626             #[cfg(target_arch = "x86_64")]
1627             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1628             #[cfg(target_arch = "aarch64")]
1629             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1630             ..Default::default()
1631         };
1632         if singlestep {
1633             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1634         }
1635 
1636         // Set the debug registers.
1637         // Here we assume that the number of addresses does not exceed what
1638         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1639         #[cfg(target_arch = "x86_64")]
1640         {
1641             // Set bits 9 and 10.
1642             // bit 9: GE (global exact breakpoint enable) flag.
1643             // bit 10: always 1.
1644             dbg.arch.debugreg[7] = 0x0600;
1645 
1646             for (i, addr) in addrs.iter().enumerate() {
1647                 dbg.arch.debugreg[i] = addr.0;
1648                 // Set global breakpoint enable flag
1649                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1650             }
1651         }
1652         #[cfg(target_arch = "aarch64")]
1653         {
1654             for (i, addr) in addrs.iter().enumerate() {
1655                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1656                 // bit 0: 1 (Enabled)
1657                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1658                 // bit 5~8: 0b1111 (BAS = AArch64)
1659                 // others: 0
1660                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1661                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1662                 // bit 2~52: VA[2:52]
1663                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1664             }
1665         }
1666         self.fd
1667             .set_guest_debug(&dbg)
1668             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1669     }
1670     #[cfg(target_arch = "aarch64")]
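    ///
    /// Initializes the vCPU via the KVM_ARM_VCPU_INIT ioctl, selecting the
    /// target CPU type and the requested features.
    ///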
1671     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1672         self.fd
1673             .vcpu_init(kvi)
1674             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1675     }
1676     ///
1677     /// Gets a list of the guest registers that are supported for the
1678     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1679     ///
1680     #[cfg(target_arch = "aarch64")]
1681     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1682         self.fd
1683             .get_reg_list(reg_list)
1684             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1685     }
1686     ///
1687     /// Gets the value of a system register
1688     ///
1689     #[cfg(target_arch = "aarch64")]
1690     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1691         //
1692         // The Arm Architecture Reference Manual defines the encoding of
1693         // AArch64 system registers, see
1694         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1695         // KVM defines its own ID for each AArch64 system register, which is
1696         // used when calling `KVM_GET/SET_ONE_REG` to access a system register
1697         // of a guest.
1698         // A mapping exists between the Arm standard encoding and the KVM ID.
1699         // This function takes the standard u32 encoding as input, converts it
1700         // to the corresponding KVM ID, and calls `KVM_GET_ONE_REG` to read the
1701         // value of the system register.
1702         //
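        // For example (a worked sketch): MPIDR_EL1 is sys_reg(3, 0, 0, 0, 5),
        // i.e. (3 << 19) | (0 << 16) | (0 << 12) | (0 << 8) | (5 << 5), which
        // is 0x1800a0 in the standard encoding. Shifting right by 5 yields
        // 0xc005, placing op0/op1/CRn/CRm/op2 at the bit positions KVM
        // expects, so the final ID is
        // KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG | 0xc005.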
1703         let id: u64 = KVM_REG_ARM64
1704             | KVM_REG_SIZE_U64
1705             | KVM_REG_ARM64_SYSREG as u64
1706             | ((((sys_reg) >> 5)
1707                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1708                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1709                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1710                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1711                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1712         Ok(self
1713             .fd
1714             .get_one_reg(id)
1715             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1716             .try_into()
1717             .unwrap())
1718     }
1719     ///
1720     /// Configure core registers for a given CPU.
1721     ///
1722     #[cfg(target_arch = "aarch64")]
1723     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1724         #[allow(non_upper_case_globals)]
1725         // PSR (Processor State Register) bits.
1726         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1727         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1728         const PSR_F_BIT: u64 = 0x0000_0040;
1729         const PSR_I_BIT: u64 = 0x0000_0080;
1730         const PSR_A_BIT: u64 = 0x0000_0100;
1731         const PSR_D_BIT: u64 = 0x0000_0200;
1732         // Taken from arch/arm64/kvm/inject_fault.c.
1733         const PSTATE_FAULT_BITS_64: u64 =
1734             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1735 
1736         let kreg_off = offset__of!(kvm_regs, regs);
1737 
1738         // Get the register index of the PSTATE (Processor State) register.
1739         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1740         self.fd
1741             .set_one_reg(
1742                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1743                 PSTATE_FAULT_BITS_64.into(),
1744             )
1745             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1746 
1747         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1748         if cpu_id == 0 {
1749             // Setting the PC (Program Counter) to the current program address (kernel address).
1750             let pc = offset__of!(user_pt_regs, pc) + kreg_off;
1751             self.fd
1752                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into())
1753                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1754 
1755             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1756             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1757             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1758             // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
1759             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1760             self.fd
1761                 .set_one_reg(
1762                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1763                     fdt_start.into(),
1764                 )
1765                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1766         }
1767         Ok(())
1768     }
1769 
1770     #[cfg(target_arch = "x86_64")]
1771     ///
1772     /// Get the current CPU state
1773     ///
1774     /// Ordering requirements:
1775     ///
1776     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
1777     /// vCPU/LAPIC state. As such, it must be done before almost everything
1778     /// else, otherwise we cannot restore everything and expect it to work.
1779     ///
1780     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1781     /// still running.
1782     ///
1783     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1784     ///
1785     /// GET_VCPU_EVENTS should probably be last to save. It looks as if it
1786     /// could be affected by the internal state modifications of the other
1787     /// GET ioctls.
1788     ///
1789     /// SREGS saves/restores a pending interrupt, similar to what
1790     /// VCPU_EVENTS also does.
1791     ///
1792     /// GET_MSRS requires a pre-populated data structure to do something
1793     /// meaningful. For SET_MSRS it will then contain good data.
1794     ///
1795     /// # Example
1796     ///
1797     /// ```rust
1798     /// # extern crate hypervisor;
1799     /// # use hypervisor::KvmHypervisor;
1800     /// # use std::sync::Arc;
1801     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1802     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1803     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1804     /// vm.enable_split_irq().unwrap();
1805     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1806     /// let state = vcpu.state().unwrap();
1807     /// ```
1808     fn state(&self) -> cpu::Result<CpuState> {
1809         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1810         let mp_state = self.get_mp_state()?.into();
1811         let regs = self.get_regs()?;
1812         let sregs = self.get_sregs()?;
1813         let xsave = self.get_xsave()?;
1814         let xcrs = self.get_xcrs()?;
1815         let lapic_state = self.get_lapic()?;
1816         let fpu = self.get_fpu()?;
1817 
1818         // Try to get all MSRs based on the list previously retrieved from KVM.
1819         // If the number of MSRs obtained from GET_MSRS is different from the
1820         // expected amount, we fall back on a slower method, getting the MSRs
1821         // in chunks. This is the only way to make sure we try to get as many
1822         // MSRs as possible, even if some MSRs are not supported.
1823         let mut msr_entries = self.msrs.clone();
1824 
1825         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1826         // emulated.
1827         if self.hyperv_synic.load(Ordering::Acquire) {
1828             let hyperv_synic_msrs = vec![
1829                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1830                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1831                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1832                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1833                 0x400000b5, 0x400000b6, 0x400000b7,
1834             ];
1835             for index in hyperv_synic_msrs {
1836                 let msr = kvm_msr_entry {
1837                     index,
1838                     ..Default::default()
1839                 };
1840                 msr_entries.push(msr.into());
1841             }
1842         }
1843 
1844         let expected_num_msrs = msr_entries.len();
1845         let num_msrs = self.get_msrs(&mut msr_entries)?;
1846         let msrs = if num_msrs != expected_num_msrs {
1847             let mut faulty_msr_index = num_msrs;
1848             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
1849 
1850             loop {
1851                 warn!(
1852                     "Detected faulty MSR 0x{:x} while getting MSRs",
1853                     msr_entries[faulty_msr_index].index
1854                 );
1855 
1856                 // Skip the first bad MSR
1857                 let start_pos = faulty_msr_index + 1;
1858 
1859                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
1860                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1861 
1862                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
1863 
1864                 if num_msrs == sub_msr_entries.len() {
1865                     break;
1866                 }
1867 
1868                 faulty_msr_index = start_pos + num_msrs;
1869             }
1870 
1871             msr_entries_tmp
1872         } else {
1873             msr_entries
1874         };
1875 
1876         let vcpu_events = self.get_vcpu_events()?;
1877 
1878         Ok(VcpuKvmState {
1879             cpuid,
1880             msrs,
1881             vcpu_events,
1882             regs: regs.into(),
1883             sregs: sregs.into(),
1884             fpu,
1885             lapic_state,
1886             xsave,
1887             xcrs,
1888             mp_state,
1889         }
1890         .into())
1891     }
1892     ///
1893     /// Get the current AArch64 CPU state
1894     ///
1895     #[cfg(target_arch = "aarch64")]
1896     fn state(&self) -> cpu::Result<CpuState> {
1897         let mut state = VcpuKvmState {
1898             mp_state: self.get_mp_state()?.into(),
1899             ..Default::default()
1900         };
1901         // Get core registers
1902         state.core_regs = self.get_regs()?;
1903 
1904         // Get system registers.
1905         // Call KVM_GET_REG_LIST to get all registers available to the guest.
1906         // For Armv8 there are around 500 registers.
1907         let mut sys_regs: Vec<Register> = Vec::new();
1908         let mut reg_list = RegList::new(500).unwrap();
1909         self.fd
1910             .get_reg_list(&mut reg_list)
1911             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1912 
1913         // At this point reg_list should contain: core registers and system
1914         // registers.
1915         // The register list contains the number of registers and their ids. We
1916         // will be needing to call KVM_GET_ONE_REG on each id in order to save
1917         // all of them. We carve out from the list  the core registers which are
1918         // represented in the kernel by kvm_regs structure and for which we can
1919         // calculate the id based on the offset in the structure.
1920         reg_list.retain(|regid| is_system_register(*regid));
1921 
1922         // Now, for the rest of the registers left in the previously fetched
1923         // register list, we are simply calling KVM_GET_ONE_REG.
1924         let indices = reg_list.as_slice();
1925         for index in indices.iter() {
1926             sys_regs.push(kvm_bindings::kvm_one_reg {
1927                 id: *index,
1928                 addr: self
1929                     .fd
1930                     .get_one_reg(*index)
1931                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1932                     .try_into()
1933                     .unwrap(),
1934             });
1935         }
1936 
1937         state.sys_regs = sys_regs;
1938 
1939         Ok(state.into())
1940     }
1941     #[cfg(target_arch = "x86_64")]
1942     ///
1943     /// Restore the previously saved CPU state
1944     ///
1945     /// Ordering requirements:
1946     ///
1947     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1948     /// still running.
1949     ///
1950     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1951     /// if we ever change the BSP, we have to do that before restoring anything.
1952     /// The same seems to be true for CPUID stuff.
1953     ///
1954     /// SREGS saves/restores a pending interrupt, similar to what
1955     /// VCPU_EVENTS also does.
1956     ///
1957     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1958     /// done before SET_VCPU_EVENTS, which restores it.
1959     ///
1960     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1961     /// the apic base msr.
1962     ///
1963     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1964     /// only restores successfully when the LAPIC is correctly configured.
1965     ///
1966     /// Arguments: the `CpuState` to restore.
1967     /// # Example
1968     ///
1969     /// ```rust
1970     /// # extern crate hypervisor;
1971     /// # use hypervisor::KvmHypervisor;
1972     /// # use std::sync::Arc;
1973     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1974     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1975     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1976     /// vm.enable_split_irq().unwrap();
1977     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1978     /// let state = vcpu.state().unwrap();
1979     /// vcpu.set_state(&state).unwrap();
1980     /// ```
1981     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1982         let state: VcpuKvmState = state.clone().into();
1983         self.set_cpuid2(&state.cpuid)?;
1984         self.set_mp_state(state.mp_state.into())?;
1985         self.set_regs(&state.regs.into())?;
1986         self.set_sregs(&state.sregs.into())?;
1987         self.set_xsave(&state.xsave)?;
1988         self.set_xcrs(&state.xcrs)?;
1989         self.set_lapic(&state.lapic_state)?;
1990         self.set_fpu(&state.fpu)?;
1991 
1992         // Try to set all MSRs previously stored.
1993         // If the number of MSRs set from SET_MSRS is different from the
1994         // expected amount, we fall back on a slower method, setting the MSRs
1995         // in chunks. This is the only way to make sure we try to set as many
1996         // MSRs as possible, even if some MSRs are not supported.
1997         let expected_num_msrs = state.msrs.len();
1998         let num_msrs = self.set_msrs(&state.msrs)?;
1999         if num_msrs != expected_num_msrs {
2000             let mut faulty_msr_index = num_msrs;
2001 
2002             loop {
2003                 warn!(
2004                     "Detected faulty MSR 0x{:x} while setting MSRs",
2005                     state.msrs[faulty_msr_index].index
2006                 );
2007 
2008                 // Skip the first bad MSR
2009                 let start_pos = faulty_msr_index + 1;
2010 
2011                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2012 
2013                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2014 
2015                 if num_msrs == sub_msr_entries.len() {
2016                     break;
2017                 }
2018 
2019                 faulty_msr_index = start_pos + num_msrs;
2020             }
2021         }
2022 
2023         self.set_vcpu_events(&state.vcpu_events)?;
2024 
2025         Ok(())
2026     }
2027     ///
2028     /// Restore the previously saved AArch64 CPU state
2029     ///
2030     #[cfg(target_arch = "aarch64")]
2031     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2032         let state: VcpuKvmState = state.clone().into();
2033         // Set core registers
2034         self.set_regs(&state.core_regs)?;
2035         // Set system registers
2036         for reg in &state.sys_regs {
2037             self.fd
2038                 .set_one_reg(reg.id, reg.addr.into())
2039                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2040         }
2041 
2042         self.set_mp_state(state.mp_state.into())?;
2043 
2044         Ok(())
2045     }
2046 
2047     ///
2048     /// Initialize TDX for this CPU
2049     ///
2050     #[cfg(feature = "tdx")]
2051     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2052         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
2053             .map_err(cpu::HypervisorCpuError::InitializeTdx)
2054     }
2055 
2056     ///
2057     /// Set the "immediate_exit" state
2058     ///
2059     fn set_immediate_exit(&self, exit: bool) {
2060         self.fd.set_kvm_immediate_exit(exit.into());
2061     }
2062 
2063     ///
2064     /// Returns the details about TDX exit reason
2065     ///
2066     #[cfg(feature = "tdx")]
2067     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2068         let kvm_run = self.fd.get_kvm_run();
2069         // SAFETY: accessing a union field in a valid structure
2070         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2071 
2072         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2073 
2074         if tdx_vmcall.type_ != 0 {
2075             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2076         }
2077 
2078         match tdx_vmcall.subfunction {
2079             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2080             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2081                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2082             }
2083             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2084         }
2085     }
2086 
2087     ///
2088     /// Set the status code for TDX exit
2089     ///
2090     #[cfg(feature = "tdx")]
2091     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2092         let kvm_run = self.fd.get_kvm_run();
2093         // SAFETY: accessing a union field in a valid structure
2094         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2095 
2096         tdx_vmcall.status_code = match status {
2097             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2098             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2099         };
2100     }
2101     #[cfg(target_arch = "x86_64")]
2102     ///
2103     /// Return the list of initial MSR entries for a VCPU
2104     ///
2105     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2106         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2107 
2108         [
2109             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2110             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2111             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2112             msr!(msr_index::MSR_STAR),
2113             msr!(msr_index::MSR_CSTAR),
2114             msr!(msr_index::MSR_LSTAR),
2115             msr!(msr_index::MSR_KERNEL_GS_BASE),
2116             msr!(msr_index::MSR_SYSCALL_MASK),
2117             msr!(msr_index::MSR_IA32_TSC),
2118             msr_data!(
2119                 msr_index::MSR_IA32_MISC_ENABLE,
2120                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2121             ),
2122             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2123         ]
2124         .to_vec()
2125     }
2126     #[cfg(target_arch = "aarch64")]
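    ///
    /// Checks whether the in-kernel PMUv3 is supported for this vCPU by
    /// probing the KVM_ARM_VCPU_PMU_V3_INIT device attribute.
    ///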
2127     fn has_pmu_support(&self) -> bool {
2128         let cpu_attr = kvm_bindings::kvm_device_attr {
2129             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2130             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2131             addr: 0x0,
2132             flags: 0,
2133         };
2134         self.fd.has_device_attr(&cpu_attr).is_ok()
2135     }
2136     #[cfg(target_arch = "aarch64")]
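    ///
    /// Initializes the in-kernel PMUv3 for this vCPU. The PMU overflow
    /// interrupt (KVM_ARM_VCPU_PMU_V3_IRQ) must be set before the final
    /// KVM_ARM_VCPU_PMU_V3_INIT call, which is the order used below.
    ///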
2137     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2138         let cpu_attr = kvm_bindings::kvm_device_attr {
2139             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2140             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2141             addr: 0x0,
2142             flags: 0,
2143         };
2144         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2145             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2146             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2147             addr: &irq as *const u32 as u64,
2148             flags: 0,
2149         };
2150         self.fd
2151             .set_device_attr(&cpu_attr_irq)
2152             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2153         self.fd
2154             .set_device_attr(&cpu_attr)
2155             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2156     }
2157 }
2158 
2159 impl KvmVcpu {
2160     #[cfg(target_arch = "x86_64")]
2161     ///
2162     /// X86 specific call that returns the vcpu's current "xsave struct".
2163     ///
2164     fn get_xsave(&self) -> cpu::Result<Xsave> {
2165         self.fd
2166             .get_xsave()
2167             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2168     }
2169     #[cfg(target_arch = "x86_64")]
2170     ///
2171     /// X86 specific call that sets the vcpu's current "xsave struct".
2172     ///
2173     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2174         self.fd
2175             .set_xsave(xsave)
2176             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2177     }
2178     #[cfg(target_arch = "x86_64")]
2179     ///
2180     /// X86 specific call that returns the vcpu's current "xcrs".
2181     ///
2182     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2183         self.fd
2184             .get_xcrs()
2185             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2186     }
2187     #[cfg(target_arch = "x86_64")]
2188     ///
2189     /// X86 specific call that sets the vcpu's current "xcrs".
2190     ///
2191     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2192         self.fd
2193             .set_xcrs(xcrs)
2194             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2195     }
2196     #[cfg(target_arch = "x86_64")]
2197     ///
2198     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2199     /// states of the vcpu.
2200     ///
2201     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2202         self.fd
2203             .get_vcpu_events()
2204             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2205     }
2206     #[cfg(target_arch = "x86_64")]
2207     ///
2208     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2209     /// of the vcpu.
2210     ///
2211     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2212         self.fd
2213             .set_vcpu_events(events)
2214             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2215     }
2216 }
2217