// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState as CpuState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
use std::os::unix::io::{AsRawFd, RawFd};
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::NUM_IOAPIC_PINS;
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
pub use x86_64::{
    CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
    Xsave, CPUID_FLAG_VALID_INDEX,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
    kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
    kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
    kvm_bindings::kvm_run, kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
    kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
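// A sketch of the request number the line above works out to (assuming
// KVMIO == 0xAE and an LP64 c_ulong): _IOWR(0xAE, 0xba, u64), i.e.
// (3 << 30) | (8 << 16) | (0xAE << 8) | 0xba == 0xc008_aeba.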

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize)]
pub struct KvmVmState {}

pub use KvmVmState as VmState;

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    state: KvmVmState,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of the Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.state().unwrap(); // then call the set_*/get_* methods as needed
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(
        &self,
        gsi: u32,
        config: &InterruptSourceConfig,
    ) -> kvm_irq_routing_entry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // the 'segment' data.
                    // This way we resolve the range-checking problem and give a
                    // unique 'devid' to every device. The limitation is that at
                    // most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
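                    // Illustrative example (hypothetical values): segment 0x0002,
                    // bus 0x00, device 0x03, function 1 encode to BDF 0x0002_0019,
                    // and the computation above yields
                    // (0x0002_0019 & 0x00ff_0000) >> 8 | 0x19 == 0x0219, which
                    // fits in 16 bits as required.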

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
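
    // A minimal usage sketch for the two methods above (illustrative only;
    // `vm` is a KvmVm and `msi_cfg` is an MsiIrqSourceConfig, both assumed
    // to exist):
    //
    //     let entry = vm.make_routing_entry(32, &InterruptSourceConfig::MsiIrq(msi_cfg));
    //     vm.set_gsi_routing(&[entry]).unwrap();
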
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
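
    // For example, a read-only region without logging gets flags == KVM_MEM_READONLY,
    // while a writable region with logging gets flags == KVM_MEM_LOG_DIRTY_PAGES;
    // requesting both is rejected by create_user_memory_region() below.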
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty page logging.
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create the guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty page logging is turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in the
        // kernel; the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Get the Vm state. Return VM specific data
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = MemoryRegion {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };
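        // Note: `pages` is the region size in 4 KiB pages; e.g. a 2 MiB region
        // yields pages = 512.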

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of the Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap(); // 0 is the default platform type
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, meaning the ioctl
                        // has been interrupted, we have to retry since this
                        // can't be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as a Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }
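        // For example, a host reporting a 40-bit IPA limit yields vm_type = 40,
        // which KVM interprets as KVM_VM_TYPE_ARM_IPA_SIZE(40).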

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get_mp_state().unwrap(); // then call the get_*/set_* methods as needed
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        self.fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state)
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which helps prevent
    /// potential soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialized,
            // which could be because we're still in firmware or because the guest
            // doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 hardware breakpoints are supported but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }
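        // Worked example: with two breakpoint addresses, the loop above sets the
        // global-enable bits G0 and G1 (bits 1 and 3), so
        // dbg.arch.debugreg[7] == 0x0600 | 0x2 | 0x8 == 0x060a.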

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are actually the general-purpose registers of the ARMv8-A
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
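        // Each core register is addressed via arm64_core_reg_id!, which (as defined
        // in this crate) composes KVM_REG_ARM64, the size flag and KVM_REG_ARM_CORE
        // with the struct field offset expressed in 32-bit words, matching the
        // kernel's KVM_REG_ARM_CORE_REG() encoding.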
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other registers" section of the ARMv8-A architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1: when taking an exception to EL1, this register
        // holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers: there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in the user_fpsimd_state struct in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows exactly the same order as `core_registers`. Look there
        // for some additional info on the registers.
1442         let mut off = offset__of!(user_pt_regs, regs);
1443         for i in 0..31 {
1444             self.fd
1445                 .set_one_reg(
1446                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1447                     state.regs.regs[i],
1448                 )
1449                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1450             off += std::mem::size_of::<u64>();
1451         }
1452 
1453         let off = offset__of!(user_pt_regs, sp);
1454         self.fd
1455             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1456             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1457 
1458         let off = offset__of!(user_pt_regs, pc);
1459         self.fd
1460             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1461             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1462 
1463         let off = offset__of!(user_pt_regs, pstate);
1464         self.fd
1465             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1466             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1467 
1468         let off = offset__of!(kvm_regs, sp_el1);
1469         self.fd
1470             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1471             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1472 
1473         let off = offset__of!(kvm_regs, elr_el1);
1474         self.fd
1475             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1476             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1477 
1478         let mut off = offset__of!(kvm_regs, spsr);
1479         for i in 0..KVM_NR_SPSR as usize {
1480             self.fd
1481                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1482                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1483             off += std::mem::size_of::<u64>();
1484         }
1485 
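         // Note: set_one_reg takes a u64 payload, so the `as u64` cast below keeps
         // only the lower 64 bits of each 128-bit vector register (the save path
         // above has the matching limitation, as get_one_reg returns a u64).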
1486         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1487         for i in 0..32 {
1488             self.fd
1489                 .set_one_reg(
1490                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1491                     state.fp_regs.vregs[i] as u64,
1492                 )
1493                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1494             off += mem::size_of::<u128>();
1495         }
1496 
1497         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1498         self.fd
1499             .set_one_reg(
1500                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1501                 state.fp_regs.fpsr as u64,
1502             )
1503             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1504 
1505         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1506         self.fd
1507             .set_one_reg(
1508                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1509                 state.fp_regs.fpcr as u64,
1510             )
1511             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1512         Ok(())
1513     }
1514     ///
1515     /// Save the state of the system registers.
1516     ///
1517     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1518     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1519         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ARMv8 there
1520         // are around 500 registers.
1521         let mut reg_list = RegList::new(500).unwrap();
1522         self.fd
1523             .get_reg_list(&mut reg_list)
1524             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1525 
1526         // At this point reg_list should contain: core registers and system registers.
1527         // The register list contains the number of registers and their ids. We will need to
1528         // call KVM_GET_ONE_REG on each id in order to save them all. We carve out from the
1529         // list the core registers, which are represented in the kernel by the kvm_regs
1530         // structure and whose ids we can calculate from their offsets in that structure.
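         // For example, a 64-bit core register at byte offset `off` inside kvm_regs
         // gets the id KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | (off / 4),
         // which is what arm64_core_reg_id! builds (the KVM ABI expresses core
         // register offsets in 32-bit units).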
1531         reg_list.retain(|regid| is_system_register(*regid));
1532 
1533         // Now, for the remaining registers in the previously fetched list, we simply
1534         // call KVM_GET_ONE_REG.
1535         let indices = reg_list.as_slice();
1536         for index in indices.iter() {
1537             state.push(kvm_bindings::kvm_one_reg {
1538                 id: *index,
1539                 addr: self
1540                     .fd
1541                     .get_one_reg(*index)
1542                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1543             });
1544         }
1545 
1546         Ok(())
1547     }
1548     ///
1549     /// Restore the state of the system registers.
1550     ///
1551     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1552     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1553         for reg in state {
1554             self.fd
1555                 .set_one_reg(reg.id, reg.addr)
1556                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1557         }
1558         Ok(())
1559     }
1560     ///
1561     /// Read the MPIDR - Multiprocessor Affinity Register.
1562     ///
1563     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1564     fn read_mpidr(&self) -> cpu::Result<u64> {
1565         self.fd
1566             .get_one_reg(MPIDR_EL1)
1567             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1568     }
1569     ///
1570     /// Configure core registers for a given CPU.
1571     ///
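         /// # Example
         ///
         /// ```rust
         /// # extern crate hypervisor;
         /// # use hypervisor::KvmHypervisor;
         /// # use std::sync::Arc;
         /// // A sketch; assumes an aarch64 host with KVM, an initialized vCPU, and
         /// // hypothetical guest addresses for the kernel entry point and the FDT.
         /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
         /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
         /// let vm = hv.create_vm().expect("new VM fd creation failed");
         /// let vcpu = vm.create_vcpu(0, None).unwrap();
         /// let mut kvi = hypervisor::kvm::VcpuInit::default();
         /// vm.get_preferred_target(&mut kvi).unwrap();
         /// vcpu.vcpu_init(&kvi).unwrap();
         /// vcpu.setup_regs(0, 0x4008_0000, 0x4000_0000).unwrap();
         /// ```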
1572     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1573     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1574         #[allow(non_upper_case_globals)]
1575         // PSR (Program Status Register) bits.
1576         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1577         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1578         const PSR_F_BIT: u64 = 0x0000_0040;
1579         const PSR_I_BIT: u64 = 0x0000_0080;
1580         const PSR_A_BIT: u64 = 0x0000_0100;
1581         const PSR_D_BIT: u64 = 0x0000_0200;
1582         // Taken from arch/arm64/kvm/inject_fault.c.
1583         const PSTATE_FAULT_BITS_64: u64 =
1584             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
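             // i.e. 0x3c5: EL1h mode with the D, A, I and F exception bits all masked.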
1585 
1586         let kreg_off = offset__of!(kvm_regs, regs);
1587 
1588         // Get the register index of the PSTATE (Processor State) register.
1589         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1590         self.set_reg(
1591             arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1592             PSTATE_FAULT_BITS_64,
1593         )
1594         .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1595 
1596         // vCPUs other than the boot CPU start powered off, awaiting a PSCI wakeup.
1597         if cpu_id == 0 {
1598             // Set the PC (Program Counter) to the boot address (the kernel entry point).
1599             let pc = offset__of!(user_pt_regs, pc) + kreg_off;
1600             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip)
1601                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1602 
1603             // The last mandatory thing to set is the address of the FDT (also called the DTB).
1604             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1605             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1606             // We choose to place it at the end of DRAM. See `get_fdt_addr`.
1607             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1608             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
1609                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1610         }
1611         Ok(())
1612     }
1613 
1614     #[cfg(target_arch = "x86_64")]
1615     ///
1616     /// Get the current CPU state
1617     ///
1618     /// Ordering requirements:
1619     ///
1620     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
1621     /// vCPU/LAPIC state. As such, it must be done before almost everything
1622     /// else; otherwise we cannot restore everything and expect it to work.
1623     ///
1624     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1625     /// still running.
1626     ///
1627     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1628     ///
1629     /// GET_VCPU_EVENTS should probably be saved last: it looks as though it
1630     /// could be affected by the internal state modifications that the other
1631     /// GET ioctls perform.
1632     ///
1633     /// SREGS saves/restores a pending interrupt, similar to what
1634     /// VCPU_EVENTS also does.
1635     ///
1636     /// GET_MSRS requires a pre-populated data structure to do something
1637     /// meaningful; after the call, it contains the data that SET_MSRS needs.
1638     ///
1639     /// # Example
1640     ///
1641     /// ```rust
1642     /// # extern crate hypervisor;
1643     /// # use hypervisor::KvmHypervisor;
1644     /// # use std::sync::Arc;
1645     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1646     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1647     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1648     /// vm.enable_split_irq().unwrap();
1649     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1650     /// let state = vcpu.state().unwrap();
1651     /// ```
1652     fn state(&self) -> cpu::Result<CpuState> {
1653         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1654         let mp_state = self.get_mp_state()?;
1655         let regs = self.get_regs()?;
1656         let sregs = self.get_sregs()?;
1657         let xsave = self.get_xsave()?;
1658         let xcrs = self.get_xcrs()?;
1659         let lapic_state = self.get_lapic()?;
1660         let fpu = self.get_fpu()?;
1661 
1662         // Try to get all MSRs based on the list previously retrieved from KVM.
1663         // If the number of MSRs obtained from GET_MSRS differs from the expected
1664         // number, we fall back to a slower method, getting the MSRs in chunks.
1665         // This is the only way to make sure we get as many MSRs as possible,
1666         // even if some of them are not supported.
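         // For example, if the list holds 100 MSRs and GET_MSRS returns 57, then
         // entry 57 is the faulty one: entries 0..57 are kept, and we retry from
         // entry 58 onwards until the remaining tail is fetched in full.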
1667         let mut msr_entries = self.msrs.clone();
1668 
1669         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1670         // emulated.
1671         if self.hyperv_synic.load(Ordering::Acquire) {
1672             let hyperv_synic_msrs = vec![
1673                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1674                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1675                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1676                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1677                 0x400000b5, 0x400000b6, 0x400000b7,
1678             ];
1679             for index in hyperv_synic_msrs {
1680                 let msr = kvm_msr_entry {
1681                     index,
1682                     ..Default::default()
1683                 };
1684                 msr_entries.push(msr).unwrap();
1685             }
1686         }
1687 
1688         let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
1689         let num_msrs = self.get_msrs(&mut msr_entries)?;
1690         let msrs = if num_msrs != expected_num_msrs {
1691             let mut faulty_msr_index = num_msrs;
1692             let mut msr_entries_tmp =
1693                 MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();
1694 
1695             loop {
1696                 warn!(
1697                     "Detected faulty MSR 0x{:x} while getting MSRs",
1698                     msr_entries.as_slice()[faulty_msr_index].index
1699                 );
1700 
1701                 let start_pos = faulty_msr_index + 1;
1702                 let mut sub_msr_entries =
1703                     MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
1704                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1705                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1706 
1707                 for i in 0..num_msrs {
1708                     msr_entries_tmp
1709                         .push(sub_msr_entries.as_slice()[i])
1710                         .map_err(|e| {
1711                             cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
1712                                 "Failed adding MSR entries: {:?}",
1713                                 e
1714                             ))
1715                         })?;
1716                 }
1717 
1718                 if num_msrs == expected_num_msrs {
1719                     break;
1720                 }
1721 
1722                 faulty_msr_index = start_pos + num_msrs;
1723             }
1724 
1725             msr_entries_tmp
1726         } else {
1727             msr_entries
1728         };
1729 
1730         let vcpu_events = self.get_vcpu_events()?;
1731 
1732         Ok(CpuState {
1733             cpuid,
1734             msrs,
1735             vcpu_events,
1736             regs,
1737             sregs,
1738             fpu,
1739             lapic_state,
1740             xsave,
1741             xcrs,
1742             mp_state,
1743         })
1744     }
1745     ///
1746     /// Get the current AArch64 CPU state
1747     ///
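         /// # Example
         ///
         /// ```rust
         /// # extern crate hypervisor;
         /// # use hypervisor::KvmHypervisor;
         /// # use std::sync::Arc;
         /// // A minimal sketch; assumes an aarch64 host with KVM available and a
         /// // vCPU that has been initialized via `vcpu_init`.
         /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
         /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
         /// let vm = hv.create_vm().expect("new VM fd creation failed");
         /// let vcpu = vm.create_vcpu(0, None).unwrap();
         /// let mut kvi = hypervisor::kvm::VcpuInit::default();
         /// vm.get_preferred_target(&mut kvi).unwrap();
         /// vcpu.vcpu_init(&kvi).unwrap();
         /// let state = vcpu.state().unwrap();
         /// ```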
1748     #[cfg(target_arch = "aarch64")]
1749     fn state(&self) -> cpu::Result<CpuState> {
1750         let mut state = CpuState {
1751             mp_state: self.get_mp_state()?,
1752             mpidr: self.read_mpidr()?,
1753             ..Default::default()
1754         };
1755         self.core_registers(&mut state.core_regs)?;
1756         self.system_registers(&mut state.sys_regs)?;
1757 
1758         Ok(state)
1759     }
1760     #[cfg(target_arch = "x86_64")]
1761     ///
1762     /// Restore the previously saved CPU state
1763     ///
1764     /// Ordering requirements:
1765     ///
1766     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1767     /// still running.
1768     ///
1769     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1770     /// if we ever change the BSP, we have to do that before restoring anything.
1771     /// The same seems to be true for CPUID stuff.
1772     ///
1773     /// SREGS saves/restores a pending interrupt, similar to what
1774     /// VCPU_EVENTS also does.
1775     ///
1776     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1777     /// done before SET_VCPU_EVENTS, which restores it.
1778     ///
1779     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1780     /// the APIC base MSR.
1781     ///
1782     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1783     /// only restores successfully when the LAPIC is correctly configured.
1784     ///
1785     /// Arguments: CpuState
1786     /// # Example
1787     ///
1788     /// ```rust
1789     /// # extern crate hypervisor;
1790     /// # use hypervisor::KvmHypervisor;
1791     /// # use std::sync::Arc;
1792     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1793     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1794     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1795     /// vm.enable_split_irq().unwrap();
1796     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1797     /// let state = vcpu.state().unwrap();
1798     /// vcpu.set_state(&state).unwrap();
1799     /// ```
1800     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1801         self.set_cpuid2(&state.cpuid)?;
1802         self.set_mp_state(state.mp_state)?;
1803         self.set_regs(&state.regs)?;
1804         self.set_sregs(&state.sregs)?;
1805         self.set_xsave(&state.xsave)?;
1806         self.set_xcrs(&state.xcrs)?;
1807         self.set_lapic(&state.lapic_state)?;
1808         self.set_fpu(&state.fpu)?;
1809 
1810         // Try to set all MSRs previously stored.
1811         // If the number of MSRs set through SET_MSRS differs from the expected
1812         // number, we fall back to a slower method, setting the MSRs in chunks.
1813         // This is the only way to make sure we set as many MSRs as possible,
1814         // even if some of them are not supported.
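         // For example, if SET_MSRS reports 57 of 100 MSRs set, entry 57 is the
         // faulty one and we retry from entry 58 onwards.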
1815         let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
1816         let num_msrs = self.set_msrs(&state.msrs)?;
1817         if num_msrs != expected_num_msrs {
1818             let mut faulty_msr_index = num_msrs;
1819 
1820             loop {
1821                 warn!(
1822                     "Detected faulty MSR 0x{:x} while setting MSRs",
1823                     state.msrs.as_slice()[faulty_msr_index].index
1824                 );
1825 
1826                 let start_pos = faulty_msr_index + 1;
1827                 let sub_msr_entries =
1828                     MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
1829                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1830                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
1831 
1832                 if num_msrs == expected_num_msrs {
1833                     break;
1834                 }
1835 
1836                 faulty_msr_index = start_pos + num_msrs;
1837             }
1838         }
1839 
1840         self.set_vcpu_events(&state.vcpu_events)?;
1841 
1842         Ok(())
1843     }
1844     ///
1845     /// Restore the previously saved AArch64 CPU state
1846     ///
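         /// # Example
         ///
         /// ```rust
         /// # extern crate hypervisor;
         /// # use hypervisor::KvmHypervisor;
         /// # use std::sync::Arc;
         /// // A minimal sketch; assumes an aarch64 host with KVM available and an
         /// // initialized vCPU (see `state` above for the saving side).
         /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
         /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
         /// let vm = hv.create_vm().expect("new VM fd creation failed");
         /// let vcpu = vm.create_vcpu(0, None).unwrap();
         /// let mut kvi = hypervisor::kvm::VcpuInit::default();
         /// vm.get_preferred_target(&mut kvi).unwrap();
         /// vcpu.vcpu_init(&kvi).unwrap();
         /// let state = vcpu.state().unwrap();
         /// vcpu.set_state(&state).unwrap();
         /// ```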
1847     #[cfg(target_arch = "aarch64")]
1848     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1849         self.set_core_registers(&state.core_regs)?;
1850         self.set_system_registers(&state.sys_regs)?;
1851         self.set_mp_state(state.mp_state)?;
1852 
1853         Ok(())
1854     }
1855 
1856     ///
1857     /// Initialize TDX for this CPU
1858     ///
1859     #[cfg(feature = "tdx")]
1860     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1861         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1862             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1863     }
1864 
1865     ///
1866     /// Set the "immediate_exit" state
1867     ///
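         /// # Example
         ///
         /// ```rust
         /// # extern crate hypervisor;
         /// # use hypervisor::KvmHypervisor;
         /// # use std::sync::Arc;
         /// // A minimal sketch: request that the next KVM_RUN return immediately.
         /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
         /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
         /// let vm = hv.create_vm().expect("new VM fd creation failed");
         /// let vcpu = vm.create_vcpu(0, None).unwrap();
         /// vcpu.set_immediate_exit(true);
         /// ```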
1868     fn set_immediate_exit(&self, exit: bool) {
1869         self.fd.set_kvm_immediate_exit(exit.into());
1870     }
1871 
1872     ///
1873     /// Returns the details about the TDX exit reason
1874     ///
1875     #[cfg(feature = "tdx")]
1876     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
1877         let kvm_run = self.fd.get_kvm_run();
1878         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
1879 
1880         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
1881 
1882         if tdx_vmcall.type_ != 0 {
1883             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
1884         }
1885 
1886         match tdx_vmcall.subfunction {
1887             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
1888             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
1889                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
1890             }
1891             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
1892         }
1893     }
1894 
1895     ///
1896     /// Set the status code for the TDX exit
1897     ///
1898     #[cfg(feature = "tdx")]
1899     fn set_tdx_status(&mut self, status: TdxExitStatus) {
1900         let kvm_run = self.fd.get_kvm_run();
1901         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
1902 
1903         tdx_vmcall.status_code = match status {
1904             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
1905             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
1906         };
1907     }
1908     #[cfg(target_arch = "x86_64")]
1909     ///
1910     /// Return the list of initial MSR entries for a VCPU
1911     ///
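         /// # Example
         ///
         /// ```rust
         /// # extern crate hypervisor;
         /// # use hypervisor::KvmHypervisor;
         /// # use std::sync::Arc;
         /// // A sketch: seed a fresh vCPU with the boot-time MSR values.
         /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
         /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
         /// let vm = hv.create_vm().expect("new VM fd creation failed");
         /// vm.enable_split_irq().unwrap();
         /// let vcpu = vm.create_vcpu(0, None).unwrap();
         /// let msrs = vcpu.boot_msr_entries();
         /// vcpu.set_msrs(&msrs).unwrap();
         /// ```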
1912     fn boot_msr_entries(&self) -> MsrEntries {
1913         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
1914         use kvm_bindings::kvm_msr_entry as MsrEntry;
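         // msr!(index) expands to an MsrEntry with zeroed data, while
         // msr_data!(index, data) also supplies an initial value: here, fast-string
         // enable and a write-back MTRR default type.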
1915 
1916         MsrEntries::from_entries(&[
1917             msr!(msr_index::MSR_IA32_SYSENTER_CS),
1918             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
1919             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
1920             msr!(msr_index::MSR_STAR),
1921             msr!(msr_index::MSR_CSTAR),
1922             msr!(msr_index::MSR_LSTAR),
1923             msr!(msr_index::MSR_KERNEL_GS_BASE),
1924             msr!(msr_index::MSR_SYSCALL_MASK),
1925             msr!(msr_index::MSR_IA32_TSC),
1926             msr_data!(
1927                 msr_index::MSR_IA32_MISC_ENABLE,
1928                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
1929             ),
1930             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
1931         ])
1932         .unwrap()
1933     }
1934 }
1935 
1936 /// Device struct for KVM
1937 pub struct KvmDevice {
1938     fd: DeviceFd,
1939 }
1940 
1941 impl device::Device for KvmDevice {
1942     ///
1943     /// Set device attribute
1944     ///
1945     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1946         self.fd
1947             .set_device_attr(attr)
1948             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1949     }
1950     ///
1951     /// Get device attribute
1952     ///
1953     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1954         self.fd
1955             .get_device_attr(attr)
1956             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1957     }
1958 }
1959 
1960 impl AsRawFd for KvmDevice {
1961     fn as_raw_fd(&self) -> RawFd {
1962         self.fd.as_raw_fd()
1963     }
1964 }
1965