// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState, Xsave,
    CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
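
// A round-trip sketch of the two conversions above, with hypothetical values
// (`host_addr` stands in for a valid mmap'ed host address):
//
//     let kvm_region = kvm_userspace_memory_region {
//         slot: 0,
//         guest_phys_addr: 0x1_0000,
//         memory_size: 0x1000,
//         userspace_addr: host_addr,
//         flags: KVM_MEM_LOG_DIRTY_PAGES,
//     };
//     let region: UserMemoryRegion = kvm_region.into();
//     // Readability is implicit on the KVM side (there is no READ flag there),
//     // so READ and WRITE both get set here, along with LOG_DIRTY.
//     assert_eq!(
//         region.flags,
//         USER_MEMORY_REGION_READ | USER_MEMORY_REGION_WRITE | USER_MEMORY_REGION_LOG_DIRTY
//     );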

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of the Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set_gsi_routing(&[]).unwrap();
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a vCPU and returns it wrapped in a `Vcpu` trait object.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from the address it was previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // is in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This resolves the range-checking problem and gives a
                    // distinct `devid` to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
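                    // As a worked example with hypothetical values: segment
                    // 0x0001, bus 0x00, device 0x02, function 0x3 gives the
                    // BDF 0x0001_0013, and the expression below yields
                    // (0x0001_0013 & 0x00ff_0000) >> 8 | 0x13 == 0x0113.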
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
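    /// An illustrative call, with entries built via `make_routing_entry`:
    /// vm.set_gsi_routing(&entries).unwrap();
    ///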
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing was allocated with room for entries.len() entries,
        // and entries_slice is created with that same length, so it is guaranteed
        // to be large enough to hold everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
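    /// A minimal sketch with hypothetical values (`host_addr` stands in for a
    /// valid mmap'ed host address):
    /// let region = vm.make_user_memory_region(0, 0x1_0000, 0x1000, host_addr, false, true);
    ///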
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in-kernel;
        // the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
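    /// The bitmap is packed into u64 words; as a sizing sketch (assuming
    /// 4 KiB pages), a region of `memory_size` bytes yields roughly
    /// memory_size / 4096 / 64 words.
    ///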
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of the Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap(); // 0 is the default platform type
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl was
                        // interrupted and we have to retry, as this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get_mp_state().unwrap();
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
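        // A note on the register id (a sketch; see the arm64_core_reg_id!
        // macro definition for the authoritative construction): the id passed
        // to get_one_reg() combines KVM_REG_ARM64, KVM_REG_ARM_CORE and the
        // register size with the field offset inside kvm_regs.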
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other registers" section of the Armv8-A architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1; when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; the kernel uses 5 of them.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in
        // the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function writes the registers in exactly the same order as
        // `get_regs()` reads them; see that function for details on each register.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Record that Hyper-V SynIC is enabled and emulated, as this will
        // later influence which MSRs need to be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Runs the current virtual CPU and returns an exit reason.
    ///
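    /// A typical dispatch loop (illustrative; `handle` is a hypothetical handler):
    /// loop { match vcpu.run()? { cpu::VmExit::Ignore => continue, exit => handle(exit) } }
    ///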
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1484                     // On AArch64, when the VM is shut down, run() returns
1485                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1486                     if event_type == KVM_SYSTEM_EVENT_RESET {
1487                         Ok(cpu::VmExit::Reset)
1488                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1489                         Ok(cpu::VmExit::Shutdown)
1490                     } else {
1491                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1492                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1493                             event_type,
1494                             flags
1495                         )))
1496                     }
1497                 }
1498 
1499                 VcpuExit::MmioRead(addr, data) => {
1500                     if let Some(vm_ops) = &self.vm_ops {
1501                         return vm_ops
1502                             .mmio_read(addr, data)
1503                             .map(|_| cpu::VmExit::Ignore)
1504                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1505                     }
1506 
1507                     Ok(cpu::VmExit::MmioRead(addr, data))
1508                 }
1509                 VcpuExit::MmioWrite(addr, data) => {
1510                     if let Some(vm_ops) = &self.vm_ops {
1511                         return vm_ops
1512                             .mmio_write(addr, data)
1513                             .map(|_| cpu::VmExit::Ignore)
1514                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1515                     }
1516 
1517                     Ok(cpu::VmExit::MmioWrite(addr, data))
1518                 }
1519                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1520                 #[cfg(feature = "tdx")]
1521                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1522                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1523 
1524                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1525                     "Unexpected exit reason on vcpu run: {:?}",
1526                     r
1527                 ))),
1528             },
1529 
1530             Err(ref e) => match e.errno() {
1531                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1532                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1533                     "VCPU error {:?}",
1534                     e
1535                 ))),
1536             },
1537         }
1538     }
1539     #[cfg(target_arch = "x86_64")]
1540     ///
1541     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
1542     /// states of the vcpu.
1543     ///
1544     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
1545         self.fd
1546             .get_vcpu_events()
1547             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
1548     }
1549     #[cfg(target_arch = "x86_64")]
1550     ///
1551     /// Sets pending exceptions, interrupts, and NMIs as well as related states
1552     /// of the vcpu.
1553     ///
1554     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
1555         self.fd
1556             .set_vcpu_events(events)
1557             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
1558     }
1559     #[cfg(target_arch = "x86_64")]
1560     ///
1561     /// Let the guest know that it has been paused, which prevents
1562     /// potential soft lockups when it is resumed.
1563     ///
1564     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1565         if let Err(e) = self.fd.kvmclock_ctrl() {
1566             // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
1567             // which could be because we're still in firmware or because the guest
1568             // doesn't use the KVM clock.
1569             if e.errno() != libc::EINVAL {
1570                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1571             }
1572         }
1573 
1574         Ok(())
1575     }
1576     #[cfg(target_arch = "x86_64")]
1577     ///
1578     /// Sets debug registers to install hardware breakpoints and/or enable single-stepping.
1579     ///
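    /// # Example
    ///
    /// A sketch with illustrative addresses (at most 4 are accepted, matching
    /// the 4 hardware breakpoint registers on x86):
    ///
    /// ```ignore
    /// use vm_memory::GuestAddress;
    /// // Two hardware breakpoints plus single-stepping.
    /// vcpu.set_guest_debug(&[GuestAddress(0x1000), GuestAddress(0x2000)], true)
    ///     .unwrap();
    /// ```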
1580     fn set_guest_debug(
1581         &self,
1582         addrs: &[vm_memory::GuestAddress],
1583         singlestep: bool,
1584     ) -> cpu::Result<()> {
1585         if addrs.len() > 4 {
1586             return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
1587                 "Support 4 breakpoints at most but {} addresses are passed",
1588                 addrs.len()
1589             )));
1590         }
1591 
1592         let mut dbg = kvm_guest_debug {
1593             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1594             ..Default::default()
1595         };
1596         if singlestep {
1597             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1598         }
1599 
1600         // Set bits 9 and 10.
1601         // bit 9: GE (global exact breakpoint enable) flag.
1602         // bit 10: always 1.
1603         dbg.arch.debugreg[7] = 0x0600;
1604 
1605         for (i, addr) in addrs.iter().enumerate() {
1606             dbg.arch.debugreg[i] = addr.0;
1607             // Set the global enable (Gi) flag for breakpoint i (odd bits 1, 3, 5, 7 of DR7).
1608             dbg.arch.debugreg[7] |= 2 << (i * 2);
1609         }
1610 
1611         self.fd
1612             .set_guest_debug(&dbg)
1613             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1614     }
1615     #[cfg(target_arch = "aarch64")]
1616     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1617         self.fd
1618             .vcpu_init(kvi)
1619             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1620     }
1621     ///
1622     /// Sets the value of one register for this vCPU.
1623     ///
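    /// A sketch mirroring `setup_regs` below, where the register id is built
    /// with the `arm64_core_reg_id!` macro from a field offset in `kvm_regs`
    /// (the boot address is illustrative):
    ///
    /// ```ignore
    /// let pc = offset__of!(user_pt_regs, pc) + offset__of!(kvm_regs, regs);
    /// vcpu.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), 0x8008_0000).unwrap();
    /// ```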
1624     #[cfg(target_arch = "aarch64")]
1625     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1626         self.fd
1627             .set_one_reg(reg_id, data)
1628             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1629     }
1630     ///
1631     /// Gets the value of one register for this vCPU.
1632     ///
1633     #[cfg(target_arch = "aarch64")]
1634     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1635         self.fd
1636             .get_one_reg(reg_id)
1637             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1638     }
1639     ///
1640     /// Gets a list of the guest registers that are supported for the
1641     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1642     ///
1643     #[cfg(target_arch = "aarch64")]
1644     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1645         self.fd
1646             .get_reg_list(reg_list)
1647             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1648     }
1649     ///
1650     /// Save the state of the system registers.
1651     ///
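    /// A save/restore sketch pairing this call with `set_sys_regs` (assumes a
    /// fully initialized aarch64 vCPU):
    ///
    /// ```ignore
    /// let saved = vcpu.get_sys_regs().unwrap();
    /// // ... snapshot, migrate, etc. ...
    /// vcpu.set_sys_regs(&saved).unwrap();
    /// ```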
1652     #[cfg(target_arch = "aarch64")]
1653     fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
1654         // Call KVM_GET_REG_LIST to get all registers available to the guest. For Armv8 there are
1655         // around 500 registers.
1656         let mut state: Vec<Register> = Vec::new();
1657         let mut reg_list = RegList::new(500).unwrap();
1658         self.fd
1659             .get_reg_list(&mut reg_list)
1660             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1661 
1662         // At this point reg_list should contain: core registers and system registers.
1663         // The register list contains the number of registers and their ids. We will need to
1664         // call KVM_GET_ONE_REG on each id in order to save them all. We carve out from the list
1665         // the core registers, which are represented in the kernel by the kvm_regs structure and
1666         // whose ids can be calculated from their offsets within that structure.
1667         reg_list.retain(|regid| is_system_register(*regid));
1668 
1669         // Now, for the registers left in the previously fetched register list, we simply
1670         // call KVM_GET_ONE_REG on each of them.
1671         let indices = reg_list.as_slice();
1672         for index in indices.iter() {
1673             state.push(kvm_bindings::kvm_one_reg {
1674                 id: *index,
1675                 addr: self
1676                     .fd
1677                     .get_one_reg(*index)
1678                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1679             });
1680         }
1681 
1682         Ok(state)
1683     }
1684     ///
1685     /// Restore the state of the system registers.
1686     ///
1687     #[cfg(target_arch = "aarch64")]
1688     fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
1689         for reg in state {
1690             self.fd
1691                 .set_one_reg(reg.id, reg.addr)
1692                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1693         }
1694         Ok(())
1695     }
1696     ///
1697     /// Read the MPIDR - Multiprocessor Affinity Register.
1698     ///
1699     #[cfg(target_arch = "aarch64")]
1700     fn read_mpidr(&self) -> cpu::Result<u64> {
1701         self.fd
1702             .get_one_reg(MPIDR_EL1)
1703             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1704     }
1705     ///
1706     /// Configure core registers for a given CPU.
1707     ///
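    /// A boot-time sketch (addresses are illustrative; per the arm64 boot
    /// protocol the boot CPU gets its PC pointed at the kernel entry point
    /// and x0 at the FDT):
    ///
    /// ```ignore
    /// vcpu.setup_regs(0, 0x8008_0000, 0xbfe0_0000).unwrap();
    /// ```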
1708     #[cfg(target_arch = "aarch64")]
1709     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1710         #[allow(non_upper_case_globals)]
1711         // PSR (Processor State Register) bits.
1712         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1713         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1714         const PSR_F_BIT: u64 = 0x0000_0040;
1715         const PSR_I_BIT: u64 = 0x0000_0080;
1716         const PSR_A_BIT: u64 = 0x0000_0100;
1717         const PSR_D_BIT: u64 = 0x0000_0200;
1718         // Taken from arch/arm64/kvm/inject_fault.c.
1719         const PSTATE_FAULT_BITS_64: u64 =
1720             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1721 
1722         let kreg_off = offset__of!(kvm_regs, regs);
1723 
1724         // Compute the offset of the PSTATE (Processor State) register within kvm_regs.
1725         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1726         self.set_reg(
1727             arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1728             PSTATE_FAULT_BITS_64,
1729         )
1730         .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1731 
1732         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1733         if cpu_id == 0 {
1734             // Set the PC (Program Counter) to the boot address (kernel entry point).
1735             let pc = offset__of!(user_pt_regs, pc) + kreg_off;
1736             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip)
1737                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1738 
1739             // The last mandatory thing to set is x0 -> the address pointing to the FDT (also called DTB).
1740             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1741             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1742             // We choose to place it at the end of DRAM. See `get_fdt_addr`.
1743             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1744             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
1745                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1746         }
1747         Ok(())
1748     }
1749 
1750     #[cfg(target_arch = "x86_64")]
1751     ///
1752     /// Get the current CPU state
1753     ///
1754     /// Ordering requirements:
1755     ///
1756     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
1757     /// vCPU/LAPIC state. As such, it must be done before almost everything
1758     /// else; otherwise we cannot restore everything and expect it to work.
1759     ///
1760     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1761     /// still running.
1762     ///
1763     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1764     ///
1765     /// GET_VCPU_EVENTS should probably be saved last, as it might be
1766     /// affected by internal state modifications of the other GET
1767     /// ioctls.
1768     ///
1769     /// SREGS saves/restores a pending interrupt, similar to what
1770     /// VCPU_EVENTS also does.
1771     ///
1772     /// GET_MSRS requires a pre-populated data structure to do something
1773     /// meaningful; after the call it contains the data that SET_MSRS needs.
1774     ///
1775     /// # Example
1776     ///
1777     /// ```rust
1778     /// # extern crate hypervisor;
1779     /// # use hypervisor::KvmHypervisor;
1780     /// # use std::sync::Arc;
1781     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1782     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1783     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1784     /// vm.enable_split_irq().unwrap();
1785     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1786     /// let state = vcpu.state().unwrap();
1787     /// ```
1788     fn state(&self) -> cpu::Result<CpuState> {
1789         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1790         let mp_state = self.get_mp_state()?.into();
1791         let regs = self.get_regs()?;
1792         let sregs = self.get_sregs()?;
1793         let xsave = self.get_xsave()?;
1794         let xcrs = self.get_xcrs()?;
1795         let lapic_state = self.get_lapic()?;
1796         let fpu = self.get_fpu()?;
1797 
1798         // Try to get all MSRs based on the list previously retrieved from KVM.
1799         // If the number of MSRs obtained from GET_MSRS is different from the
1800         // expected amount, we fall back on a slower method, getting MSRs
1801         // in chunks. This is the only way to make sure we try to get as many
1802         // MSRs as possible, even if some MSRs are not supported.
1803         let mut msr_entries = self.msrs.clone();
1804 
1805         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1806         // emulated.
1807         if self.hyperv_synic.load(Ordering::Acquire) {
1808             let hyperv_synic_msrs = vec![
1809                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1810                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1811                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1812                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1813                 0x400000b5, 0x400000b6, 0x400000b7,
1814             ];
1815             for index in hyperv_synic_msrs {
1816                 let msr = kvm_msr_entry {
1817                     index,
1818                     ..Default::default()
1819                 };
1820                 msr_entries.push(msr).unwrap();
1821             }
1822         }
1823 
1824         let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
1825         let num_msrs = self.get_msrs(&mut msr_entries)?;
1826         let msrs = if num_msrs != expected_num_msrs {
1827             let mut faulty_msr_index = num_msrs;
1828             let mut msr_entries_tmp =
1829                 MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();
1830 
1831             loop {
1832                 warn!(
1833                     "Detected faulty MSR 0x{:x} while getting MSRs",
1834                     msr_entries.as_slice()[faulty_msr_index].index
1835                 );
1836 
1837                 let start_pos = faulty_msr_index + 1;
1838                 let mut sub_msr_entries =
1839                     MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
1840                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1841                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1842 
1843                 for i in 0..num_msrs {
1844                     msr_entries_tmp
1845                         .push(sub_msr_entries.as_slice()[i])
1846                         .map_err(|e| {
1847                             cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
1848                                 "Failed adding MSR entries: {:?}",
1849                                 e
1850                             ))
1851                         })?;
1852                 }
1853 
1854                 if num_msrs == expected_num_msrs {
1855                     break;
1856                 }
1857 
1858                 faulty_msr_index = start_pos + num_msrs;
1859             }
1860 
1861             msr_entries_tmp
1862         } else {
1863             msr_entries
1864         };
1865 
1866         let vcpu_events = self.get_vcpu_events()?;
1867 
1868         Ok(VcpuKvmState {
1869             cpuid,
1870             msrs,
1871             vcpu_events,
1872             regs,
1873             sregs,
1874             fpu,
1875             lapic_state,
1876             xsave,
1877             xcrs,
1878             mp_state,
1879         }
1880         .into())
1881     }
1882     ///
1883     /// Get the current AArch64 CPU state
1884     ///
1885     #[cfg(target_arch = "aarch64")]
1886     fn state(&self) -> cpu::Result<CpuState> {
1887         let mut state = VcpuKvmState {
1888             mp_state: self.get_mp_state()?.into(),
1889             mpidr: self.read_mpidr()?,
1890             ..Default::default()
1891         };
1892         state.core_regs = self.get_regs()?;
1893         state.sys_regs = self.get_sys_regs()?;
1894 
1895         Ok(state.into())
1896     }
1897     #[cfg(target_arch = "x86_64")]
1898     ///
1899     /// Restore the previously saved CPU state
1900     ///
1901     /// Ordering requirements:
1902     ///
1903     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1904     /// still running.
1905     ///
1906     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1907     /// if we ever change the BSP, we have to do that before restoring anything.
1908     /// The same seems to be true for CPUID stuff.
1909     ///
1910     /// SREGS saves/restores a pending interrupt, similar to what
1911     /// VCPU_EVENTS also does.
1912     ///
1913     /// SET_REGS clears pending exceptions unconditionally, so it must be
1914     /// done before SET_VCPU_EVENTS, which restores them.
1915     ///
1916     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1917     /// the APIC base MSR.
1918     ///
1919     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1920     /// only restores successfully when the LAPIC is correctly configured.
1921     ///
1922     /// Arguments: CpuState
1923     /// # Example
1924     ///
1925     /// ```rust
1926     /// # extern crate hypervisor;
1927     /// # use hypervisor::KvmHypervisor;
1928     /// # use std::sync::Arc;
1929     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1930     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1931     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1932     /// vm.enable_split_irq().unwrap();
1933     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1934     /// let state = vcpu.state().unwrap();
1935     /// vcpu.set_state(&state).unwrap();
1936     /// ```
1937     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1938         let state: VcpuKvmState = state.clone().into();
1939         self.set_cpuid2(&state.cpuid)?;
1940         self.set_mp_state(state.mp_state.into())?;
1941         self.set_regs(&state.regs)?;
1942         self.set_sregs(&state.sregs)?;
1943         self.set_xsave(&state.xsave)?;
1944         self.set_xcrs(&state.xcrs)?;
1945         self.set_lapic(&state.lapic_state)?;
1946         self.set_fpu(&state.fpu)?;
1947 
1948         // Try to set all MSRs previously stored.
1949         // If the number of MSRs set from SET_MSRS is different from the
1950         // expected amount, we fall back on a slower method, setting MSRs
1951         // in chunks. This is the only way to make sure we try to set as many
1952         // MSRs as possible, even if some MSRs are not supported.
1953         let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
1954         let num_msrs = self.set_msrs(&state.msrs)?;
1955         if num_msrs != expected_num_msrs {
1956             let mut faulty_msr_index = num_msrs;
1957 
1958             loop {
1959                 warn!(
1960                     "Detected faulty MSR 0x{:x} while setting MSRs",
1961                     state.msrs.as_slice()[faulty_msr_index].index
1962                 );
1963 
1964                 let start_pos = faulty_msr_index + 1;
1965                 let sub_msr_entries =
1966                     MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
1967                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1968                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
1969 
1970                 if num_msrs == expected_num_msrs {
1971                     break;
1972                 }
1973 
1974                 faulty_msr_index = start_pos + num_msrs;
1975             }
1976         }
1977 
1978         self.set_vcpu_events(&state.vcpu_events)?;
1979 
1980         Ok(())
1981     }
1982     ///
1983     /// Restore the previously saved AArch64 CPU state
1984     ///
1985     #[cfg(target_arch = "aarch64")]
1986     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1987         let state: VcpuKvmState = state.clone().into();
1988         self.set_regs(&state.core_regs)?;
1989         self.set_sys_regs(&state.sys_regs)?;
1990         self.set_mp_state(state.mp_state.into())?;
1991 
1992         Ok(())
1993     }
1994 
1995     ///
1996     /// Initialize TDX for this CPU
1997     ///
1998     #[cfg(feature = "tdx")]
1999     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2000         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
2001             .map_err(cpu::HypervisorCpuError::InitializeTdx)
2002     }
2003 
2004     ///
2005     /// Set the "immediate_exit" state
2006     ///
2007     fn set_immediate_exit(&self, exit: bool) {
2008         self.fd.set_kvm_immediate_exit(exit.into());
2009     }
2010 
2011     ///
2012     /// Returns the details about the TDX exit reason.
2013     ///
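    /// A sketch of handling the returned details (TDX guests only):
    ///
    /// ```ignore
    /// match vcpu.get_tdx_exit_details().unwrap() {
    ///     TdxExitDetails::GetQuote => { /* service the quote request */ }
    ///     TdxExitDetails::SetupEventNotifyInterrupt => { /* record the vector */ }
    /// }
    /// ```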
2014     #[cfg(feature = "tdx")]
2015     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2016         let kvm_run = self.fd.get_kvm_run();
2017         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2018 
2019         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2020 
2021         if tdx_vmcall.type_ != 0 {
2022             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2023         }
2024 
2025         match tdx_vmcall.subfunction {
2026             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2027             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2028                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2029             }
2030             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2031         }
2032     }
2033 
2034     ///
2035     /// Set the status code for TDX exit
2036     ///
2037     #[cfg(feature = "tdx")]
2038     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2039         let kvm_run = self.fd.get_kvm_run();
2040         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2041 
2042         tdx_vmcall.status_code = match status {
2043             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2044             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2045         };
2046     }
2047     #[cfg(target_arch = "x86_64")]
2048     ///
2049     /// Return the list of initial MSR entries for a VCPU
2050     ///
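    /// A sketch of the intended use at vCPU creation time (`set_msrs` is the
    /// companion call that pushes these values into KVM):
    ///
    /// ```ignore
    /// let msrs = vcpu.boot_msr_entries();
    /// vcpu.set_msrs(&msrs).unwrap();
    /// ```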
2051     fn boot_msr_entries(&self) -> MsrEntries {
2052         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2053         use kvm_bindings::kvm_msr_entry as MsrEntry;
2054 
2055         MsrEntries::from_entries(&[
2056             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2057             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2058             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2059             msr!(msr_index::MSR_STAR),
2060             msr!(msr_index::MSR_CSTAR),
2061             msr!(msr_index::MSR_LSTAR),
2062             msr!(msr_index::MSR_KERNEL_GS_BASE),
2063             msr!(msr_index::MSR_SYSCALL_MASK),
2064             msr!(msr_index::MSR_IA32_TSC),
2065             msr_data!(
2066                 msr_index::MSR_IA32_MISC_ENABLE,
2067                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2068             ),
2069             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2070         ])
2071         .unwrap()
2072     }
2073 }
2074 
2075 /// Device struct for KVM
2076 pub type KvmDevice = DeviceFd;
2077 
2078 impl device::Device for KvmDevice {
2079     ///
2080     /// Set device attribute
2081     ///
2082     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
2083         self.set_device_attr(attr)
2084             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
2085     }
2086     ///
2087     /// Get device attribute
2088     ///
2089     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
2090         self.get_device_attr(attr)
2091             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
2092     }
2093     ///
2094     /// Return the device as `Any` so it can be downcast to the underlying KVM device fd
2095     ///
2096     fn as_any(&self) -> &dyn Any {
2097         self
2098     }
2099 }
2100