// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::device;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, SpecialRegisters, StandardRegisters, NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
    kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}
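// An illustrative round trip through the two conversions above (made-up flag
// values): a read-only region that requests dirty-page logging maps to the
// KVM flags `KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES`, and converting the
// result back yields `USER_MEMORY_REGION_READ | USER_MEMORY_REGION_LOG_DIRTY`.
// The READ bit is unconditional because KVM-mapped memory is always readable.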

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// // Drive the VM through the `Vm` trait's get_*()/set_*() methods.
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
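    /// For example (illustrative values; `efd` is a hypothetical EventFd), to
    /// wire an eventfd to MMIO address 0x1000 that only fires when the 32-bit
    /// value 0x42 is written:
    /// `vm.register_ioevent(&efd, &IoEventAddress::Mmio(0x1000), Some(vm::DataMatch::DataMatch32(0x42)))`
    ///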
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 the kernel limits the range of the 'devid':
                    // it must fit in 16 bits, i.e. it cannot exceed 65535
                    // (the maximum of u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits
                    // of the 'segment' data.
                    // This resolves the range-check problem while still giving
                    // a distinct `devid` to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
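                    // Worked example (hypothetical BDF): segment 0x0002,
                    // bus 0x00, device 0x03, function 0x1 encode as
                    // devid = 0x0002_0019. Then
                    // (0x0002_0019 & 0x00ff_0000) >> 8 = 0x0200, and OR'ing in
                    // the low byte 0x19 gives 0x0219, which fits in 16 bits.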
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
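    /// For instance (illustrative values; `host_addr` is a hypothetical host
    /// mapping), a writable 2 MiB region at GPA 0x1_0000 with dirty-page
    /// logging enabled:
    /// `vm.make_user_memory_region(0, 0x1_0000, 0x20_0000, host_addr, false, true)`
    ///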
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for s in dirty_log_slots.values() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for s in dirty_log_slots.values() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
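    /// Each u64 in the returned vector packs the dirty bits of 64 pages;
    /// e.g. a 2 MiB slot (512 4-KiB pages) yields 8 words.
    ///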
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0 /* default platform type */).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error is EINTR, the ioctl was interrupted and
                        // we have to retry; this is not a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// // Drive the vCPU through the `Vcpu` trait's get_*()/set_*() methods.
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when accessed as 64-bit registers, or w0-w30 as their 32-bit views).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other registers" section of the Armv8-A architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1; when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function follows exactly the same register order as `get_regs`.
        // Look there for some additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to set up the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated, as it will later influence which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets up the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1483                     if event_type == KVM_SYSTEM_EVENT_RESET {
1484                         Ok(cpu::VmExit::Reset)
1485                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1486                         Ok(cpu::VmExit::Shutdown)
1487                     } else {
1488                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1489                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1490                             event_type,
1491                             flags
1492                         )))
1493                     }
1494                 }
1495 
1496                 VcpuExit::MmioRead(addr, data) => {
1497                     if let Some(vm_ops) = &self.vm_ops {
1498                         return vm_ops
1499                             .mmio_read(addr, data)
1500                             .map(|_| cpu::VmExit::Ignore)
1501                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1502                     }
1503 
1504                     Ok(cpu::VmExit::MmioRead(addr, data))
1505                 }
1506                 VcpuExit::MmioWrite(addr, data) => {
1507                     if let Some(vm_ops) = &self.vm_ops {
1508                         return vm_ops
1509                             .mmio_write(addr, data)
1510                             .map(|_| cpu::VmExit::Ignore)
1511                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1512                     }
1513 
1514                     Ok(cpu::VmExit::MmioWrite(addr, data))
1515                 }
1516                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1517                 #[cfg(feature = "tdx")]
1518                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1519                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1520 
1521                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1522                     "Unexpected exit reason on vcpu run: {:?}",
1523                     r
1524                 ))),
1525             },
1526 
1527             Err(ref e) => match e.errno() {
1528                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1529                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1530                     "VCPU error {:?}",
1531                     e
1532                 ))),
1533             },
1534         }
1535     }
1536     #[cfg(target_arch = "x86_64")]
1537     ///
    /// Let the guest know that it has been paused, which prevents
    /// potential soft lockups when it is resumed.
1540     ///
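    /// # Example
    ///
    /// A minimal sketch; the result is deliberately ignored since hosts
    /// without KVM_CAP_KVMCLOCK_CTRL may report an error here.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let _ = vcpu.notify_guest_clock_paused();
    /// ```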
1541     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1542         if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet
            // initialised, which could be because we're still in firmware or
            // because the guest doesn't use the KVM clock.
1546             if e.errno() != libc::EINVAL {
1547                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1548             }
1549         }
1550 
1551         Ok(())
1552     }
1553     #[cfg(target_arch = "x86_64")]
1554     ///
    /// Sets debug registers to install hardware breakpoints and/or enable single-stepping.
1556     ///
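    /// # Example
    ///
    /// A minimal sketch installing one hardware breakpoint at a hypothetical
    /// guest address, without single-stepping. The result is not unwrapped
    /// since hosts lacking guest-debug support will return an error.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// # use vm_memory::GuestAddress;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// // One breakpoint at the (hypothetical) address 0x10_0000.
    /// let _ = vcpu.set_guest_debug(&[GuestAddress(0x10_0000)], false);
    /// ```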
1557     fn set_guest_debug(
1558         &self,
1559         addrs: &[vm_memory::GuestAddress],
1560         singlestep: bool,
1561     ) -> cpu::Result<()> {
1562         if addrs.len() > 4 {
1563             return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 hardware breakpoints are supported, but {} addresses were passed",
1565                 addrs.len()
1566             )));
1567         }
1568 
1569         let mut dbg = kvm_guest_debug {
1570             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1571             ..Default::default()
1572         };
1573         if singlestep {
1574             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1575         }
1576 
1577         // Set bits 9 and 10.
1578         // bit 9: GE (global exact breakpoint enable) flag.
1579         // bit 10: always 1.
1580         dbg.arch.debugreg[7] = 0x0600;
1581 
1582         for (i, addr) in addrs.iter().enumerate() {
1583             dbg.arch.debugreg[i] = addr.0;
1584             // Set global breakpoint enable flag
1585             dbg.arch.debugreg[7] |= 2 << (i * 2);
1586         }
1587 
1588         self.fd
1589             .set_guest_debug(&dbg)
1590             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1591     }
1592     #[cfg(target_arch = "aarch64")]
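    ///
    /// Initializes an aarch64 vCPU with the given configuration
    /// (the KVM_ARM_VCPU_INIT ioctl). Must be called before accessing
    /// most other vCPU state.
    ///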
1593     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1594         self.fd
1595             .vcpu_init(kvi)
1596             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1597     }
1598     ///
1599     /// Sets the value of one register for this vCPU.
1600     ///
1601     #[cfg(target_arch = "aarch64")]
1602     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1603         self.fd
1604             .set_one_reg(reg_id, data)
1605             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1606     }
1607     ///
1608     /// Gets the value of one register for this vCPU.
1609     ///
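    /// # Example
    ///
    /// A minimal sketch (aarch64) reading MPIDR_EL1 through this generic
    /// accessor; it assumes the MPIDR_EL1 register id re-exported by this
    /// module. Reads may fail until the vCPU has been through `vcpu_init()`,
    /// so the result is not unwrapped.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::kvm::MPIDR_EL1;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let _mpidr = vcpu.get_reg(MPIDR_EL1);
    /// ```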
1610     #[cfg(target_arch = "aarch64")]
1611     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1612         self.fd
1613             .get_one_reg(reg_id)
1614             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1615     }
1616     ///
1617     /// Gets a list of the guest registers that are supported for the
1618     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1619     ///
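    /// # Example
    ///
    /// A minimal sketch (aarch64), sized for the roughly 500 registers Armv8
    /// exposes, as in `get_sys_regs()` below; `RegList` is assumed to be
    /// reachable through the public `aarch64` submodule. The call may fail
    /// until the vCPU has been initialized, so the result is not unwrapped.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::kvm::aarch64::RegList;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mut reg_list = RegList::new(500).unwrap();
    /// let _ = vcpu.get_reg_list(&mut reg_list);
    /// ```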
1620     #[cfg(target_arch = "aarch64")]
1621     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1622         self.fd
1623             .get_reg_list(reg_list)
1624             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1625     }
1626     ///
1627     /// Save the state of the system registers.
1628     ///
1629     #[cfg(target_arch = "aarch64")]
1630     fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest. For Armv8 there are
        // around 500 registers.
1633         let mut state: Vec<Register> = Vec::new();
1634         let mut reg_list = RegList::new(500).unwrap();
1635         self.fd
1636             .get_reg_list(&mut reg_list)
1637             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1638 
        // At this point reg_list should contain both core registers and system registers.
        // The register list contains the number of registers and their ids. We need to
        // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out
        // from the list the core registers, which are represented in the kernel by the
        // kvm_regs structure and whose ids we can calculate from their offsets in that
        // structure.
1644         reg_list.retain(|regid| is_system_register(*regid));
1645 
        // Now, for the system registers remaining in the list, simply call
        // KVM_GET_ONE_REG on each of them.
1648         let indices = reg_list.as_slice();
1649         for index in indices.iter() {
1650             state.push(kvm_bindings::kvm_one_reg {
1651                 id: *index,
1652                 addr: self
1653                     .fd
1654                     .get_one_reg(*index)
1655                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1656             });
1657         }
1658 
1659         Ok(state)
1660     }
1661     ///
1662     /// Restore the state of the system registers.
1663     ///
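    /// # Example
    ///
    /// A minimal sketch (aarch64) restoring exactly what `get_sys_regs()`
    /// returned; register access is expected to fail until the vCPU has been
    /// through `vcpu_init()`, hence the `if let`.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// if let Ok(sys_regs) = vcpu.get_sys_regs() {
    ///     vcpu.set_sys_regs(&sys_regs).unwrap();
    /// }
    /// ```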
1664     #[cfg(target_arch = "aarch64")]
1665     fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
1666         for reg in state {
1667             self.fd
1668                 .set_one_reg(reg.id, reg.addr)
1669                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1670         }
1671         Ok(())
1672     }
1673     ///
1674     /// Read the MPIDR - Multiprocessor Affinity Register.
1675     ///
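    /// # Example
    ///
    /// A minimal sketch (aarch64); as with the other register accessors, the
    /// read may fail before `vcpu_init()`, so it is not unwrapped.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let _mpidr = vcpu.read_mpidr();
    /// ```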
1676     #[cfg(target_arch = "aarch64")]
1677     fn read_mpidr(&self) -> cpu::Result<u64> {
1678         self.fd
1679             .get_one_reg(MPIDR_EL1)
1680             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1681     }
1682     ///
1683     /// Configure core registers for a given CPU.
1684     ///
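    /// # Example
    ///
    /// A minimal sketch (aarch64) with hypothetical kernel and FDT load
    /// addresses; a real caller initializes the vCPU with `vcpu_init()`
    /// first, so the result is not unwrapped here.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// // Hypothetical guest-physical addresses for the kernel entry and FDT.
    /// let _ = vcpu.setup_regs(0, 0x4008_0000, 0x4000_0000);
    /// ```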
1685     #[cfg(target_arch = "aarch64")]
1686     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1687         #[allow(non_upper_case_globals)]
1688         // PSR (Processor State Register) bits.
1689         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1690         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1691         const PSR_F_BIT: u64 = 0x0000_0040;
1692         const PSR_I_BIT: u64 = 0x0000_0080;
1693         const PSR_A_BIT: u64 = 0x0000_0100;
1694         const PSR_D_BIT: u64 = 0x0000_0200;
1695         // Taken from arch/arm64/kvm/inject_fault.c.
1696         const PSTATE_FAULT_BITS_64: u64 =
1697             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1698 
1699         let kreg_off = offset__of!(kvm_regs, regs);
1700 
1701         // Get the register index of the PSTATE (Processor State) register.
1702         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1703         self.set_reg(
1704             arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1705             PSTATE_FAULT_BITS_64,
1706         )
1707         .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1708 
1709         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1710         if cpu_id == 0 {
            // Set the PC (Program Counter) to the boot address (the kernel entry point).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip)
1714                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1715 
            // The last mandatory thing to set: the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We choose to place it at the end of DRAM. See `get_fdt_addr`.
1720             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1721             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
1722                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1723         }
1724         Ok(())
1725     }
1726 
1727     #[cfg(target_arch = "x86_64")]
1728     ///
1729     /// Get the current CPU state
1730     ///
1731     /// Ordering requirements:
1732     ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else; otherwise we cannot restore everything and expect it to work.
1736     ///
1737     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1738     /// still running.
1739     ///
1740     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1741     ///
    /// GET_VCPU_EVENTS should probably be the last state to save, since it
    /// may be affected by internal state modifications performed by the
    /// other GET ioctls.
1745     ///
1746     /// SREGS saves/restores a pending interrupt, similar to what
1747     /// VCPU_EVENTS also does.
1748     ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful; the same structure will then contain valid data for SET_MSRS.
1751     ///
1752     /// # Example
1753     ///
1754     /// ```rust
1755     /// # extern crate hypervisor;
1756     /// # use hypervisor::KvmHypervisor;
1757     /// # use std::sync::Arc;
1758     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1759     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1760     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1761     /// vm.enable_split_irq().unwrap();
1762     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1763     /// let state = vcpu.state().unwrap();
1764     /// ```
1765     fn state(&self) -> cpu::Result<CpuState> {
1766         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1767         let mp_state = self.get_mp_state()?.into();
1768         let regs = self.get_regs()?;
1769         let sregs = self.get_sregs()?;
1770         let xsave = self.get_xsave()?;
1771         let xcrs = self.get_xcrs()?;
1772         let lapic_state = self.get_lapic()?;
1773         let fpu = self.get_fpu()?;
1774 
1775         // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS differs from the
        // expected amount, we fall back to a slower method, fetching the MSRs
        // in chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some of them are not supported.
1780         let mut msr_entries = self.msrs.clone();
1781 
1782         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1783         // emulated.
1784         if self.hyperv_synic.load(Ordering::Acquire) {
1785             let hyperv_synic_msrs = vec![
1786                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1787                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1788                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1789                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1790                 0x400000b5, 0x400000b6, 0x400000b7,
1791             ];
1792             for index in hyperv_synic_msrs {
1793                 let msr = kvm_msr_entry {
1794                     index,
1795                     ..Default::default()
1796                 };
1797                 msr_entries.push(msr).unwrap();
1798             }
1799         }
1800 
1801         let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
1802         let num_msrs = self.get_msrs(&mut msr_entries)?;
1803         let msrs = if num_msrs != expected_num_msrs {
1804             let mut faulty_msr_index = num_msrs;
1805             let mut msr_entries_tmp =
1806                 MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();
1807 
1808             loop {
1809                 warn!(
1810                     "Detected faulty MSR 0x{:x} while getting MSRs",
1811                     msr_entries.as_slice()[faulty_msr_index].index
1812                 );
1813 
1814                 let start_pos = faulty_msr_index + 1;
1815                 let mut sub_msr_entries =
1816                     MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
1817                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1818                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1819 
1820                 for i in 0..num_msrs {
1821                     msr_entries_tmp
1822                         .push(sub_msr_entries.as_slice()[i])
1823                         .map_err(|e| {
1824                             cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
1825                                 "Failed adding MSR entries: {:?}",
1826                                 e
1827                             ))
1828                         })?;
1829                 }
1830 
1831                 if num_msrs == expected_num_msrs {
1832                     break;
1833                 }
1834 
1835                 faulty_msr_index = start_pos + num_msrs;
1836             }
1837 
1838             msr_entries_tmp
1839         } else {
1840             msr_entries
1841         };
1842 
1843         let vcpu_events = self.get_vcpu_events()?;
1844 
1845         Ok(VcpuKvmState {
1846             cpuid,
1847             msrs,
1848             vcpu_events,
1849             regs: regs.into(),
1850             sregs: sregs.into(),
1851             fpu,
1852             lapic_state,
1853             xsave,
1854             xcrs,
1855             mp_state,
1856         }
1857         .into())
1858     }
1859     ///
1860     /// Get the current AArch64 CPU state
1861     ///
1862     #[cfg(target_arch = "aarch64")]
1863     fn state(&self) -> cpu::Result<CpuState> {
1864         let mut state = VcpuKvmState {
1865             mp_state: self.get_mp_state()?.into(),
1866             mpidr: self.read_mpidr()?,
1867             ..Default::default()
1868         };
1869         state.core_regs = self.get_regs()?;
1870         state.sys_regs = self.get_sys_regs()?;
1871 
1872         Ok(state.into())
1873     }
1874     #[cfg(target_arch = "x86_64")]
1875     ///
1876     /// Restore the previously saved CPU state
1877     ///
1878     /// Ordering requirements:
1879     ///
1880     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1881     /// still running.
1882     ///
1883     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1884     /// if we ever change the BSP, we have to do that before restoring anything.
1885     /// The same seems to be true for CPUID stuff.
1886     ///
1887     /// SREGS saves/restores a pending interrupt, similar to what
1888     /// VCPU_EVENTS also does.
1889     ///
1890     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1891     /// done before SET_VCPU_EVENTS, which restores it.
1892     ///
1893     /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
1895     ///
1896     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
1898     ///
    ///
    /// # Arguments
    ///
    /// * `state` - The previously saved `CpuState` to restore.
    ///
1900     /// # Example
1901     ///
1902     /// ```rust
1903     /// # extern crate hypervisor;
1904     /// # use hypervisor::KvmHypervisor;
1905     /// # use std::sync::Arc;
1906     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1907     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1908     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1909     /// vm.enable_split_irq().unwrap();
1910     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1911     /// let state = vcpu.state().unwrap();
1912     /// vcpu.set_state(&state).unwrap();
1913     /// ```
1914     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1915         let state: VcpuKvmState = state.clone().into();
1916         self.set_cpuid2(&state.cpuid)?;
1917         self.set_mp_state(state.mp_state.into())?;
1918         self.set_regs(&state.regs.into())?;
1919         self.set_sregs(&state.sregs.into())?;
1920         self.set_xsave(&state.xsave)?;
1921         self.set_xcrs(&state.xcrs)?;
1922         self.set_lapic(&state.lapic_state)?;
1923         self.set_fpu(&state.fpu)?;
1924 
1925         // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS differs from the expected
        // amount, we fall back to a slower method, setting the MSRs in
        // chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some of them are not supported.
1930         let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
1931         let num_msrs = self.set_msrs(&state.msrs)?;
1932         if num_msrs != expected_num_msrs {
1933             let mut faulty_msr_index = num_msrs;
1934 
1935             loop {
1936                 warn!(
1937                     "Detected faulty MSR 0x{:x} while setting MSRs",
1938                     state.msrs.as_slice()[faulty_msr_index].index
1939                 );
1940 
1941                 let start_pos = faulty_msr_index + 1;
1942                 let sub_msr_entries =
1943                     MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
1944                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1945                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
1946 
1947                 if num_msrs == expected_num_msrs {
1948                     break;
1949                 }
1950 
1951                 faulty_msr_index = start_pos + num_msrs;
1952             }
1953         }
1954 
1955         self.set_vcpu_events(&state.vcpu_events)?;
1956 
1957         Ok(())
1958     }
1959     ///
1960     /// Restore the previously saved AArch64 CPU state
1961     ///
1962     #[cfg(target_arch = "aarch64")]
1963     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1964         let state: VcpuKvmState = state.clone().into();
1965         self.set_regs(&state.core_regs)?;
1966         self.set_sys_regs(&state.sys_regs)?;
1967         self.set_mp_state(state.mp_state.into())?;
1968 
1969         Ok(())
1970     }
1971 
1972     ///
1973     /// Initialize TDX for this CPU
1974     ///
1975     #[cfg(feature = "tdx")]
1976     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1977         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1978             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1979     }
1980 
1981     ///
1982     /// Set the "immediate_exit" state
1983     ///
1984     fn set_immediate_exit(&self, exit: bool) {
1985         self.fd.set_kvm_immediate_exit(exit.into());
1986     }
1987 
1988     ///
    /// Returns the details about the TDX exit reason
1990     ///
1991     #[cfg(feature = "tdx")]
1992     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
1993         let kvm_run = self.fd.get_kvm_run();
1994         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
1995 
1996         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
1997 
1998         if tdx_vmcall.type_ != 0 {
1999             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2000         }
2001 
2002         match tdx_vmcall.subfunction {
2003             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2004             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2005                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2006             }
2007             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2008         }
2009     }
2010 
2011     ///
2012     /// Set the status code for TDX exit
2013     ///
2014     #[cfg(feature = "tdx")]
2015     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2016         let kvm_run = self.fd.get_kvm_run();
2017         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2018 
2019         tdx_vmcall.status_code = match status {
2020             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2021             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2022         };
2023     }
2024     #[cfg(target_arch = "x86_64")]
2025     ///
2026     /// Return the list of initial MSR entries for a VCPU
2027     ///
2028     fn boot_msr_entries(&self) -> MsrEntries {
2029         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2030         use kvm_bindings::kvm_msr_entry as MsrEntry;
2031 
2032         MsrEntries::from_entries(&[
2033             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2034             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2035             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2036             msr!(msr_index::MSR_STAR),
2037             msr!(msr_index::MSR_CSTAR),
2038             msr!(msr_index::MSR_LSTAR),
2039             msr!(msr_index::MSR_KERNEL_GS_BASE),
2040             msr!(msr_index::MSR_SYSCALL_MASK),
2041             msr!(msr_index::MSR_IA32_TSC),
2042             msr_data!(
2043                 msr_index::MSR_IA32_MISC_ENABLE,
2044                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2045             ),
2046             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2047         ])
2048         .unwrap()
2049     }
2050 }
2051 
2052 impl KvmVcpu {
2053     #[cfg(target_arch = "x86_64")]
2054     ///
2055     /// X86 specific call that returns the vcpu's current "xsave struct".
2056     ///
2057     fn get_xsave(&self) -> cpu::Result<Xsave> {
2058         self.fd
2059             .get_xsave()
2060             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2061     }
2062     #[cfg(target_arch = "x86_64")]
2063     ///
2064     /// X86 specific call that sets the vcpu's current "xsave struct".
2065     ///
2066     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2067         self.fd
2068             .set_xsave(xsave)
2069             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2070     }
2071     #[cfg(target_arch = "x86_64")]
2072     ///
2073     /// X86 specific call that returns the vcpu's current "xcrs".
2074     ///
2075     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2076         self.fd
2077             .get_xcrs()
2078             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2079     }
2080     #[cfg(target_arch = "x86_64")]
2081     ///
2082     /// X86 specific call that sets the vcpu's current "xcrs".
2083     ///
2084     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2085         self.fd
2086             .set_xcrs(xcrs)
2087             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2088     }
2089     #[cfg(target_arch = "x86_64")]
2090     ///
2091     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2092     /// states of the vcpu.
2093     ///
2094     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2095         self.fd
2096             .get_vcpu_events()
2097             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2098     }
2099     #[cfg(target_arch = "x86_64")]
2100     ///
2101     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2102     /// of the vcpu.
2103     ///
2104     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2105         self.fd
2106             .set_vcpu_events(events)
2107             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2108     }
2109 }
2110 
2111 /// Device struct for KVM
2112 pub type KvmDevice = DeviceFd;
2113 
2114 impl device::Device for KvmDevice {
2115     ///
2116     /// Set device attribute
2117     ///
2118     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
2119         self.set_device_attr(attr)
2120             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
2121     }
2122     ///
2123     /// Get device attribute
2124     ///
2125     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
2126         self.get_device_attr(attr)
2127             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
2128     }
2129     ///
2130     /// Cast to the underlying KVM device fd
2131     ///
2132     fn as_any(&self) -> &dyn Any {
2133         self
2134     }
2135 }
2136