xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision d295de4cd59ff8c7009710e101dcb09cec03a35a)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 use crate::aarch64::gic::KvmGicV3Its;
13 #[cfg(target_arch = "aarch64")]
14 pub use crate::aarch64::{
15     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
16     VcpuKvmState,
17 };
18 #[cfg(target_arch = "aarch64")]
19 use crate::arch::aarch64::gic::Vgic;
20 use crate::cpu;
21 use crate::hypervisor;
22 use crate::vec_with_array_field;
23 use crate::vm::{self, InterruptSourceConfig, VmOps};
24 use crate::HypervisorType;
25 #[cfg(target_arch = "aarch64")]
26 use crate::{arm64_core_reg_id, offset__of};
27 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
28 use std::any::Any;
29 use std::collections::HashMap;
30 #[cfg(target_arch = "aarch64")]
31 use std::convert::TryInto;
32 #[cfg(target_arch = "x86_64")]
33 use std::fs::File;
34 #[cfg(target_arch = "x86_64")]
35 use std::os::unix::io::AsRawFd;
36 #[cfg(feature = "tdx")]
37 use std::os::unix::io::RawFd;
38 use std::result;
39 #[cfg(target_arch = "x86_64")]
40 use std::sync::atomic::{AtomicBool, Ordering};
41 #[cfg(target_arch = "aarch64")]
42 use std::sync::Mutex;
43 use std::sync::{Arc, RwLock};
44 use vmm_sys_util::eventfd::EventFd;
45 // x86_64 dependencies
46 #[cfg(target_arch = "x86_64")]
47 pub mod x86_64;
48 #[cfg(target_arch = "x86_64")]
49 use crate::arch::x86::{
50     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
51     NUM_IOAPIC_PINS,
52 };
53 #[cfg(target_arch = "x86_64")]
54 use crate::ClockData;
55 use crate::{
56     CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
57     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
58 };
59 #[cfg(target_arch = "aarch64")]
60 use aarch64::{RegList, Register, StandardRegisters};
61 #[cfg(target_arch = "x86_64")]
62 use kvm_bindings::{
63     kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
64     KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
65 };
66 #[cfg(target_arch = "x86_64")]
67 use x86_64::check_required_kvm_extensions;
68 #[cfg(target_arch = "x86_64")]
69 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
70 // aarch64 dependencies
71 #[cfg(target_arch = "aarch64")]
72 pub mod aarch64;
73 pub use kvm_bindings;
74 #[cfg(feature = "tdx")]
75 use kvm_bindings::KVMIO;
76 pub use kvm_bindings::{
77     kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
78     kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
79     KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
80 };
81 #[cfg(target_arch = "aarch64")]
82 use kvm_bindings::{
83     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
84     KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_OP0_MASK,
85     KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM_CORE,
86     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
87 };
88 pub use kvm_ioctls;
89 pub use kvm_ioctls::{Cap, Kvm};
90 #[cfg(target_arch = "aarch64")]
91 use std::mem;
92 use thiserror::Error;
93 use vfio_ioctls::VfioDeviceFd;
94 #[cfg(feature = "tdx")]
95 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
96 ///
97 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
98 ///
99 pub use {
100     kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
101     kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
102 };
103 
104 #[cfg(target_arch = "x86_64")]
105 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
106 
107 #[cfg(feature = "tdx")]
108 const KVM_EXIT_TDX: u32 = 35;
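// Assumption: exit reason 35 is the KVM_EXIT_TDX value used by the kernel TDX
// patch series; it is not defined in upstream kvm-bindings, hence this local const.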
109 #[cfg(feature = "tdx")]
110 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
111 #[cfg(feature = "tdx")]
112 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
113 #[cfg(feature = "tdx")]
114 const TDG_VP_VMCALL_SUCCESS: u64 = 0;
115 #[cfg(feature = "tdx")]
116 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;
117 
118 #[cfg(feature = "tdx")]
119 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
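// The macro above generates a KVM_MEMORY_ENCRYPT_OP() helper that returns the
// _IOWR(KVMIO, 0xba, c_ulong) ioctl number used by tdx_command() below.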
120 
121 #[cfg(feature = "tdx")]
122 #[repr(u32)]
123 enum TdxCommand {
124     Capabilities = 0,
125     InitVm,
126     InitVcpu,
127     InitMemRegion,
128     Finalize,
129 }
130 
131 #[cfg(feature = "tdx")]
132 pub enum TdxExitDetails {
133     GetQuote,
134     SetupEventNotifyInterrupt,
135 }
136 
137 #[cfg(feature = "tdx")]
138 pub enum TdxExitStatus {
139     Success,
140     InvalidOperand,
141 }
142 
143 #[cfg(feature = "tdx")]
144 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
145 
146 #[cfg(feature = "tdx")]
147 #[repr(C)]
148 #[derive(Debug, Default)]
149 pub struct TdxCpuidConfig {
150     pub leaf: u32,
151     pub sub_leaf: u32,
152     pub eax: u32,
153     pub ebx: u32,
154     pub ecx: u32,
155     pub edx: u32,
156 }
157 
158 #[cfg(feature = "tdx")]
159 #[repr(C)]
160 #[derive(Debug, Default)]
161 pub struct TdxCapabilities {
162     pub attrs_fixed0: u64,
163     pub attrs_fixed1: u64,
164     pub xfam_fixed0: u64,
165     pub xfam_fixed1: u64,
166     pub nr_cpuid_configs: u32,
167     pub padding: u32,
168     pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
169 }
170 
171 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
172     fn from(region: kvm_userspace_memory_region) -> Self {
173         let mut flags = USER_MEMORY_REGION_READ;
174         if region.flags & KVM_MEM_READONLY == 0 {
175             flags |= USER_MEMORY_REGION_WRITE;
176         }
177         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
178             flags |= USER_MEMORY_REGION_LOG_DIRTY;
179         }
180 
181         UserMemoryRegion {
182             slot: region.slot,
183             guest_phys_addr: region.guest_phys_addr,
184             memory_size: region.memory_size,
185             userspace_addr: region.userspace_addr,
186             flags,
187         }
188     }
189 }
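// To illustrate the mapping above: a KVM region with `flags == 0` becomes
// READ | WRITE, KVM_MEM_READONLY drops WRITE, and KVM_MEM_LOG_DIRTY_PAGES
// additionally sets LOG_DIRTY. READ is always implied, since KVM mapped
// memory is always readable.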
190 
191 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
192     fn from(region: UserMemoryRegion) -> Self {
193         assert!(
194             region.flags & USER_MEMORY_REGION_READ != 0,
195             "KVM mapped memory is always readable"
196         );
197 
198         let mut flags = 0;
199         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
200             flags |= KVM_MEM_READONLY;
201         }
202         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
203             flags |= KVM_MEM_LOG_DIRTY_PAGES;
204         }
205 
206         kvm_userspace_memory_region {
207             slot: region.slot,
208             guest_phys_addr: region.guest_phys_addr,
209             memory_size: region.memory_size,
210             userspace_addr: region.userspace_addr,
211             flags,
212         }
213     }
214 }
215 
216 impl From<kvm_mp_state> for MpState {
217     fn from(s: kvm_mp_state) -> Self {
218         MpState::Kvm(s)
219     }
220 }
221 
222 impl From<MpState> for kvm_mp_state {
223     fn from(ms: MpState) -> Self {
224         match ms {
225             MpState::Kvm(s) => s,
226             /* Needed in case other hypervisors are enabled */
227             #[allow(unreachable_patterns)]
228             _ => panic!("MpState is not valid"),
229         }
230     }
231 }
232 
233 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
234     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
235         match a {
236             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
237             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
238         }
239     }
240 }
241 
242 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
243     fn from(a: IoEventAddress) -> Self {
244         match a {
245             IoEventAddress::Pio(x) => Self::Pio(x),
246             IoEventAddress::Mmio(x) => Self::Mmio(x),
247         }
248     }
249 }
250 
251 impl From<VcpuKvmState> for CpuState {
252     fn from(s: VcpuKvmState) -> Self {
253         CpuState::Kvm(s)
254     }
255 }
256 
257 impl From<CpuState> for VcpuKvmState {
258     fn from(s: CpuState) -> Self {
259         match s {
260             CpuState::Kvm(s) => s,
261             /* Needed in case other hypervisors are enabled */
262             #[allow(unreachable_patterns)]
263             _ => panic!("CpuState is not valid"),
264         }
265     }
266 }
267 
268 #[cfg(target_arch = "x86_64")]
269 impl From<kvm_clock_data> for ClockData {
270     fn from(d: kvm_clock_data) -> Self {
271         ClockData::Kvm(d)
272     }
273 }
274 
275 #[cfg(target_arch = "x86_64")]
276 impl From<ClockData> for kvm_clock_data {
277     fn from(ms: ClockData) -> Self {
278         match ms {
279             ClockData::Kvm(s) => s,
280             /* Needed in case other hypervisors are enabled */
281             #[allow(unreachable_patterns)]
282             _ => panic!("ClockData is not valid"),
283         }
284     }
285 }
286 
287 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
288     fn from(s: kvm_irq_routing_entry) -> Self {
289         IrqRoutingEntry::Kvm(s)
290     }
291 }
292 
293 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
294     fn from(e: IrqRoutingEntry) -> Self {
295         match e {
296             IrqRoutingEntry::Kvm(e) => e,
297             /* Needed in case other hypervisors are enabled */
298             #[allow(unreachable_patterns)]
299             _ => panic!("IrqRoutingEntry is not valid"),
300         }
301     }
302 }
303 
304 struct KvmDirtyLogSlot {
305     slot: u32,
306     guest_phys_addr: u64,
307     memory_size: u64,
308     userspace_addr: u64,
309 }
310 
311 /// Wrapper over KVM VM ioctls.
312 pub struct KvmVm {
313     fd: Arc<VmFd>,
314     #[cfg(target_arch = "x86_64")]
315     msrs: Vec<MsrEntry>,
316     dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
317 }
318 
319 impl KvmVm {
320     ///
321     /// Creates an emulated device in the kernel.
322     ///
323     /// See the documentation for `KVM_CREATE_DEVICE`.
324     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
325         let device_fd = self
326             .fd
327             .create_device(device)
328             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
329         Ok(VfioDeviceFd::new_from_kvm(device_fd))
330     }
331     /// Checks if a particular `Cap` is available.
332     fn check_extension(&self, c: Cap) -> bool {
333         self.fd.check_extension(c)
334     }
335 }
336 
337 ///
338 /// Implementation of Vm trait for KVM
339 /// Example:
340 /// #[cfg(feature = "kvm")]
341 /// extern crate hypervisor;
342 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
343 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
344 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
345 /// The returned `vm` can then be used via its set/get methods.
346 ///
347 impl vm::Vm for KvmVm {
348     #[cfg(target_arch = "x86_64")]
349     ///
350     /// Sets the address of the one-page region in the VM's address space.
351     ///
352     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
353         self.fd
354             .set_identity_map_address(address)
355             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
356     }
357     #[cfg(target_arch = "x86_64")]
358     ///
359     /// Sets the address of the three-page region in the VM's address space.
360     ///
361     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
362         self.fd
363             .set_tss_address(offset)
364             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
365     }
366     ///
367     /// Creates an in-kernel interrupt controller.
368     ///
369     fn create_irq_chip(&self) -> vm::Result<()> {
370         self.fd
371             .create_irq_chip()
372             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
373     }
374     ///
375     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
376     ///
377     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
378         self.fd
379             .register_irqfd(fd, gsi)
380             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
381     }
382     ///
383     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
384     ///
385     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
386         self.fd
387             .unregister_irqfd(fd, gsi)
388             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
389     }
390     ///
391     /// Creates a vCPU for this VM and returns it as a `cpu::Vcpu` trait object.
392     ///
393     fn create_vcpu(
394         &self,
395         id: u8,
396         vm_ops: Option<Arc<dyn VmOps>>,
397     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
398         let vc = self
399             .fd
400             .create_vcpu(id as u64)
401             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
402         let vcpu = KvmVcpu {
403             fd: vc,
404             #[cfg(target_arch = "x86_64")]
405             msrs: self.msrs.clone(),
406             vm_ops,
407             #[cfg(target_arch = "x86_64")]
408             hyperv_synic: AtomicBool::new(false),
409         };
410         Ok(Arc::new(vcpu))
411     }
412     #[cfg(target_arch = "aarch64")]
413     ///
414     /// Creates a virtual GIC device.
415     ///
416     fn create_vgic(
417         &self,
418         vcpu_count: u64,
419         dist_addr: u64,
420         dist_size: u64,
421         redist_size: u64,
422         msi_size: u64,
423         nr_irqs: u32,
424     ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
425         let gic_device = KvmGicV3Its::new(
426             self,
427             vcpu_count,
428             dist_addr,
429             dist_size,
430             redist_size,
431             msi_size,
432             nr_irqs,
433         )
434         .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
435         Ok(Arc::new(Mutex::new(gic_device)))
436     }
437     ///
438     /// Registers an event to be signaled whenever a certain address is written to.
439     ///
440     fn register_ioevent(
441         &self,
442         fd: &EventFd,
443         addr: &IoEventAddress,
444         datamatch: Option<vm::DataMatch>,
445     ) -> vm::Result<()> {
446         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
447         if let Some(dm) = datamatch {
448             match dm {
449                 vm::DataMatch::DataMatch32(kvm_dm32) => self
450                     .fd
451                     .register_ioevent(fd, addr, kvm_dm32)
452                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
453                 vm::DataMatch::DataMatch64(kvm_dm64) => self
454                     .fd
455                     .register_ioevent(fd, addr, kvm_dm64)
456                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
457             }
458         } else {
459             self.fd
460                 .register_ioevent(fd, addr, NoDatamatch)
461                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
462         }
463     }
464     ///
465     /// Unregisters an event from a certain address it has been previously registered to.
466     ///
467     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
468         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
469         self.fd
470             .unregister_ioevent(fd, addr, NoDatamatch)
471             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
472     }
473 
474     ///
475     /// Constructs a routing entry
476     ///
477     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
478         match &config {
479             InterruptSourceConfig::MsiIrq(cfg) => {
480                 let mut kvm_route = kvm_irq_routing_entry {
481                     gsi,
482                     type_: KVM_IRQ_ROUTING_MSI,
483                     ..Default::default()
484                 };
485 
486                 kvm_route.u.msi.address_lo = cfg.low_addr;
487                 kvm_route.u.msi.address_hi = cfg.high_addr;
488                 kvm_route.u.msi.data = cfg.data;
489 
490                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
491                     // On AArch64 there is a limitation on the range of the
492                     // 'devid': it cannot be greater than 65535 (the max of u16).
493                     //
494                     // The BDF cannot be used directly, because the 'segment' is
495                     // in the high 16 bits. The layout of the u32 BDF is:
496                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
497                     // |      segment    |     bus    |   device   |  function  |
498                     //
499                     // Since we support only one bus per segment, we can build a
500                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
501                     // the 'segment' data.
502                     // This resolves the range-checking problem and gives a
503                     // distinct `devid` to every device. The limitation is that
504                     // at most 256 segments can be supported.
505                     //
506                     let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
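                    // Worked example (illustrative values): segment 0x0002,
                    // bus 0x00, device 0x01, function 0 encode as BDF
                    // 0x0002_0008; the transformation above yields
                    // (0x0002_0000 >> 8) | 0x08 = 0x0208, which fits in 16 bits.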
507 
508                     kvm_route.flags = KVM_MSI_VALID_DEVID;
509                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
510                 }
511                 kvm_route.into()
512             }
513             InterruptSourceConfig::LegacyIrq(cfg) => {
514                 let mut kvm_route = kvm_irq_routing_entry {
515                     gsi,
516                     type_: KVM_IRQ_ROUTING_IRQCHIP,
517                     ..Default::default()
518                 };
519                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
520                 kvm_route.u.irqchip.pin = cfg.pin;
521 
522                 kvm_route.into()
523             }
524         }
525     }
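    // A minimal usage sketch (hypothetical `gsi` and `cfg` bindings, not part
    // of this module):
    //
    //     let entry = vm.make_routing_entry(gsi, &InterruptSourceConfig::MsiIrq(cfg));
    //     vm.set_gsi_routing(&[entry])?;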
526 
527     ///
528     /// Sets the GSI routing table entries, overwriting any previously set
529     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
530     ///
531     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
532         let mut irq_routing =
533             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
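        // kvm_irq_routing ends with a flexible array member, so we allocate a
        // Vec with enough trailing capacity for `entries.len()` routing
        // entries; only irq_routing[0] (the header) is used directly.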
534         irq_routing[0].nr = entries.len() as u32;
535         irq_routing[0].flags = 0;
536         let entries: Vec<kvm_irq_routing_entry> = entries
537             .iter()
538             .map(|entry| match entry {
539                 IrqRoutingEntry::Kvm(e) => *e,
540                 #[allow(unreachable_patterns)]
541                 _ => panic!("IrqRoutingEntry type is wrong"),
542             })
543             .collect();
544 
545         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
546         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
547         // everything from entries.
548         unsafe {
549             let entries_slice: &mut [kvm_irq_routing_entry] =
550                 irq_routing[0].entries.as_mut_slice(entries.len());
551             entries_slice.copy_from_slice(&entries);
552         }
553 
554         self.fd
555             .set_gsi_routing(&irq_routing[0])
556             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
557     }
558     ///
559     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
560     ///
561     fn make_user_memory_region(
562         &self,
563         slot: u32,
564         guest_phys_addr: u64,
565         memory_size: u64,
566         userspace_addr: u64,
567         readonly: bool,
568         log_dirty_pages: bool,
569     ) -> UserMemoryRegion {
570         kvm_userspace_memory_region {
571             slot,
572             guest_phys_addr,
573             memory_size,
574             userspace_addr,
575             flags: if readonly { KVM_MEM_READONLY } else { 0 }
576                 | if log_dirty_pages {
577                     KVM_MEM_LOG_DIRTY_PAGES
578                 } else {
579                     0
580                 },
581         }
582         .into()
583     }
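    // For illustration (hypothetical values): make_user_memory_region(0, gpa,
    // size, hva, false, true) yields a region whose only KVM flag is
    // KVM_MEM_LOG_DIRTY_PAGES, i.e. a writable, dirty-logged slot.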
584     ///
585     /// Creates a guest physical memory region.
586     ///
587     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
588         let mut region: kvm_userspace_memory_region = user_memory_region.into();
589 
590         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
591             if (region.flags & KVM_MEM_READONLY) != 0 {
592                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
593                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
594                 )));
595             }
596 
597             // Keep track of the regions that need dirty pages log
598             self.dirty_log_slots.write().unwrap().insert(
599                 region.slot,
600                 KvmDirtyLogSlot {
601                     slot: region.slot,
602                     guest_phys_addr: region.guest_phys_addr,
603                     memory_size: region.memory_size,
604                     userspace_addr: region.userspace_addr,
605                 },
606             );
607 
608             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
609             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
610             region.flags = 0;
611         }
612 
613         // SAFETY: Safe because guest regions are guaranteed not to overlap.
614         unsafe {
615             self.fd
616                 .set_user_memory_region(region)
617                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
618         }
619     }
620     ///
621     /// Removes a guest physical memory region.
622     ///
623     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
624         let mut region: kvm_userspace_memory_region = user_memory_region.into();
625 
626         // Remove the corresponding entry from "self.dirty_log_slots" if needed
627         self.dirty_log_slots.write().unwrap().remove(&region.slot);
628 
629         // Setting the size to 0 means "remove"
630         region.memory_size = 0;
631         // SAFETY: Safe because guest regions are guaranteed not to overlap.
632         unsafe {
633             self.fd
634                 .set_user_memory_region(region)
635                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
636         }
637     }
638     ///
639     /// Returns the preferred CPU target type that can be emulated by KVM on the underlying host.
640     ///
641     #[cfg(target_arch = "aarch64")]
642     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
643         self.fd
644             .get_preferred_target(kvi)
645             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
646     }
647     #[cfg(target_arch = "x86_64")]
648     fn enable_split_irq(&self) -> vm::Result<()> {
649         // Create a split irqchip:
650         // only the local APIC is emulated in-kernel; the PICs and the
651         // IOAPIC are not.
652         let mut cap = kvm_enable_cap {
653             cap: KVM_CAP_SPLIT_IRQCHIP,
654             ..Default::default()
655         };
656         cap.args[0] = NUM_IOAPIC_PINS as u64;
657         self.fd
658             .enable_cap(&cap)
659             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
660         Ok(())
661     }
662     #[cfg(target_arch = "x86_64")]
663     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
664         let mut cap = kvm_enable_cap {
665             cap: KVM_CAP_SGX_ATTRIBUTE,
666             ..Default::default()
667         };
668         cap.args[0] = file.as_raw_fd() as u64;
669         self.fd
670             .enable_cap(&cap)
671             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
672         Ok(())
673     }
674     /// Retrieve guest clock.
675     #[cfg(target_arch = "x86_64")]
676     fn get_clock(&self) -> vm::Result<ClockData> {
677         Ok(self
678             .fd
679             .get_clock()
680             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
681             .into())
682     }
683     /// Set guest clock.
684     #[cfg(target_arch = "x86_64")]
685     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
686         let data = (*data).into();
687         self.fd
688             .set_clock(&data)
689             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
690     }
691     /// Create a device that is used for passthrough
692     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
693         let mut vfio_dev = kvm_create_device {
694             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
695             fd: 0,
696             flags: 0,
697         };
698 
699         self.create_device(&mut vfio_dev)
700             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
701     }
702     ///
703     /// Start logging dirty pages
704     ///
705     fn start_dirty_log(&self) -> vm::Result<()> {
706         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
707         for (_, s) in dirty_log_slots.iter() {
708             let region = kvm_userspace_memory_region {
709                 slot: s.slot,
710                 guest_phys_addr: s.guest_phys_addr,
711                 memory_size: s.memory_size,
712                 userspace_addr: s.userspace_addr,
713                 flags: KVM_MEM_LOG_DIRTY_PAGES,
714             };
715             // SAFETY: Safe because guest regions are guaranteed not to overlap.
716             unsafe {
717                 self.fd
718                     .set_user_memory_region(region)
719                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
720             }
721         }
722 
723         Ok(())
724     }
725 
726     ///
727     /// Stop logging dirty pages
728     ///
729     fn stop_dirty_log(&self) -> vm::Result<()> {
730         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
731         for (_, s) in dirty_log_slots.iter() {
732             let region = kvm_userspace_memory_region {
733                 slot: s.slot,
734                 guest_phys_addr: s.guest_phys_addr,
735                 memory_size: s.memory_size,
736                 userspace_addr: s.userspace_addr,
737                 flags: 0,
738             };
739             // SAFETY: Safe because guest regions are guaranteed not to overlap.
740             unsafe {
741                 self.fd
742                     .set_user_memory_region(region)
743                     .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
744             }
745         }
746 
747         Ok(())
748     }
749 
750     ///
751     /// Get dirty pages bitmap (one bit per page)
752     ///
753     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
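        // The bitmap packs one bit per guest page into u64 words, so the
        // returned Vec holds about memory_size / page_size / 64 entries
        // (rounded up).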
754         self.fd
755             .get_dirty_log(slot, memory_size as usize)
756             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
757     }
758 
759     ///
760     /// Initialize TDX for this VM
761     ///
762     #[cfg(feature = "tdx")]
763     fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
764         use std::io::{Error, ErrorKind};
765         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
766             cpuid.iter().map(|e| (*e).into()).collect();
767         let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
768             vm::HypervisorVmError::InitializeTdx(Error::new(
769                 ErrorKind::Other,
770                 "failed to allocate CpuId",
771             ))
772         })?;
773 
774         #[repr(C)]
775         struct TdxInitVm {
776             max_vcpus: u32,
777             tsc_khz: u32,
778             attributes: u64,
779             cpuid: u64,
780             mrconfigid: [u64; 6],
781             mrowner: [u64; 6],
782             mrownerconfig: [u64; 6],
783             reserved: [u64; 43],
784         }
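        // Assumption: this layout mirrors `struct kvm_tdx_init_vm` from the
        // kernel TDX patch series; `cpuid` is the address of a kvm_cpuid2
        // buffer, passed through the ioctl as a u64.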
785         let data = TdxInitVm {
786             max_vcpus,
787             tsc_khz: 0,
788             attributes: 0,
789             cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
790             mrconfigid: [0; 6],
791             mrowner: [0; 6],
792             mrownerconfig: [0; 6],
793             reserved: [0; 43],
794         };
795 
796         tdx_command(
797             &self.fd.as_raw_fd(),
798             TdxCommand::InitVm,
799             0,
800             &data as *const _ as u64,
801         )
802         .map_err(vm::HypervisorVmError::InitializeTdx)
803     }
804 
805     ///
806     /// Finalize the TDX setup for this VM
807     ///
808     #[cfg(feature = "tdx")]
809     fn tdx_finalize(&self) -> vm::Result<()> {
810         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
811             .map_err(vm::HypervisorVmError::FinalizeTdx)
812     }
813 
814     ///
815     /// Initialize memory regions for the TDX VM
816     ///
817     #[cfg(feature = "tdx")]
818     fn tdx_init_memory_region(
819         &self,
820         host_address: u64,
821         guest_address: u64,
822         size: u64,
823         measure: bool,
824     ) -> vm::Result<()> {
825         #[repr(C)]
826         struct TdxInitMemRegion {
827             host_address: u64,
828             guest_address: u64,
829             pages: u64,
830         }
831         let data = TdxInitMemRegion {
832             host_address,
833             guest_address,
834             pages: size / 4096,
835         };
836 
837         tdx_command(
838             &self.fd.as_raw_fd(),
839             TdxCommand::InitMemRegion,
840             if measure { 1 } else { 0 },
841             &data as *const _ as u64,
842         )
843         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
844     }
845     /// Downcast to the underlying KvmVm type
846     fn as_any(&self) -> &dyn Any {
847         self
848     }
849 }
850 
851 #[cfg(feature = "tdx")]
852 fn tdx_command(
853     fd: &RawFd,
854     command: TdxCommand,
855     metadata: u32,
856     data: u64,
857 ) -> std::result::Result<(), std::io::Error> {
858     #[repr(C)]
859     struct TdxIoctlCmd {
860         command: TdxCommand,
861         metadata: u32,
862         data: u64,
863     }
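    // Assumption: this mirrors `struct kvm_tdx_cmd` from the kernel TDX patch
    // series; the same KVM_MEMORY_ENCRYPT_OP ioctl carries every TdxCommand,
    // issued on the VM fd or the /dev/kvm fd depending on the command.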
864     let cmd = TdxIoctlCmd {
865         command,
866         metadata,
867         data,
868     };
869     // SAFETY: FFI call. All input parameters are valid.
870     let ret = unsafe {
871         ioctl_with_val(
872             fd,
873             KVM_MEMORY_ENCRYPT_OP(),
874             &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
875         )
876     };
877 
878     if ret < 0 {
879         return Err(std::io::Error::last_os_error());
880     }
881     Ok(())
882 }
883 
884 /// Wrapper over KVM system ioctls.
885 pub struct KvmHypervisor {
886     kvm: Kvm,
887 }
888 
889 impl KvmHypervisor {
890     #[cfg(target_arch = "x86_64")]
891     ///
892     /// Retrieve the list of MSRs supported by the hypervisor.
893     ///
894     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
895         self.kvm
896             .get_msr_index_list()
897             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
898     }
899 }
900 
901 /// Enum for KVM-related errors
902 #[derive(Debug, Error)]
903 pub enum KvmError {
904     #[error("Capability missing: {0:?}")]
905     CapabilityMissing(Cap),
906 }
907 pub type KvmResult<T> = result::Result<T, KvmError>;
908 impl KvmHypervisor {
909     /// Create a hypervisor based on Kvm
910     #[allow(clippy::new_ret_no_self)]
911     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
912         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
913         let api_version = kvm_obj.get_api_version();
914 
915         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
916             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
917         }
918 
919         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
920     }
921     /// Check if the hypervisor is available
922     pub fn is_available() -> hypervisor::Result<bool> {
923         match std::fs::metadata("/dev/kvm") {
924             Ok(_) => Ok(true),
925             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
926             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
927                 err.into(),
928             )),
929         }
930     }
931 }
932 /// Implementation of Hypervisor trait for KVM
933 /// Example:
934 /// #[cfg(feature = "kvm")]
935 /// extern crate hypervisor;
936 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
937 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
938 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
939 ///
940 impl hypervisor::Hypervisor for KvmHypervisor {
941     ///
942     /// Returns the type of the hypervisor
943     ///
944     fn hypervisor_type(&self) -> HypervisorType {
945         HypervisorType::Kvm
946     }
947     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
948     /// Example
949     /// # extern crate hypervisor;
950     /// # use hypervisor::kvm::KvmHypervisor;
951     /// # use hypervisor::kvm::KvmVm;
952     /// let hypervisor = KvmHypervisor::new().unwrap();
953     /// let vm = hypervisor.create_vm_with_type(0).unwrap();
954     ///
955     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
956         let fd: VmFd;
957         loop {
958             match self.kvm.create_vm_with_type(vm_type) {
959                 Ok(res) => fd = res,
960                 Err(e) => {
961                     if e.errno() == libc::EINTR {
962                         // If the error returned is EINTR, which means the
963                         // ioctl has been interrupted, we have to retry as
964                         // this can't be considered as a regular error.
965                         continue;
966                     } else {
967                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
968                     }
969                 }
970             }
971             break;
972         }
973 
974         let vm_fd = Arc::new(fd);
975 
976         #[cfg(target_arch = "x86_64")]
977         {
978             let msr_list = self.get_msr_list()?;
979             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
980             let mut msrs: Vec<MsrEntry> = vec![
981                 MsrEntry {
982                     ..Default::default()
983                 };
984                 num_msrs
985             ];
986             let indices = msr_list.as_slice();
987             for (pos, index) in indices.iter().enumerate() {
988                 msrs[pos].index = *index;
989             }
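            // `msrs` is now a template of zeroed entries, one per MSR index
            // supported by the host; each KvmVcpu clones it (see create_vcpu)
            // when saving and restoring its MSR state.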
990 
991             Ok(Arc::new(KvmVm {
992                 fd: vm_fd,
993                 msrs,
994                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
995             }))
996         }
997 
998         #[cfg(target_arch = "aarch64")]
999         {
1000             Ok(Arc::new(KvmVm {
1001                 fd: vm_fd,
1002                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1003             }))
1004         }
1005     }
1006 
1007     /// Create a KVM vm object and return the object as Vm trait object
1008     /// Example
1009     /// # extern crate hypervisor;
1010     /// # use hypervisor::kvm::KvmHypervisor;
1011     /// # use hypervisor::kvm::KvmVm;
1012     /// let hypervisor = KvmHypervisor::new().unwrap();
1013     /// let vm = hypervisor.create_vm().unwrap();
1014     ///
1015     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1016         #[allow(unused_mut)]
1017         let mut vm_type: u64 = 0; // Create with default platform type
1018 
1019         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
1020         // size from the host and use that when creating the VM, which may
1021         // avoid unnecessary VM creation failures.
1022         #[cfg(target_arch = "aarch64")]
1023         if self.kvm.check_extension(Cap::ArmVmIPASize) {
1024             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
1025         }
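        // For example, a host IPA limit of 40 makes vm_type 40, which KVM
        // interprets as KVM_VM_TYPE_ARM_IPA_SIZE(40) when creating the VM.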
1026 
1027         self.create_vm_with_type(vm_type)
1028     }
1029 
1030     fn check_required_extensions(&self) -> hypervisor::Result<()> {
1031         check_required_kvm_extensions(&self.kvm)
1032             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
1033     }
1034 
1035     #[cfg(target_arch = "x86_64")]
1036     ///
1037     /// X86 specific call to get the system supported CPUID values.
1038     ///
1039     fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
1040         let kvm_cpuid = self
1041             .kvm
1042             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
1043             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;
1044 
1045         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1046 
1047         Ok(v)
1048     }
1049 
1050     #[cfg(target_arch = "aarch64")]
1051     ///
1052     /// Retrieve AArch64 host maximum IPA size supported by KVM.
1053     ///
1054     fn get_host_ipa_limit(&self) -> i32 {
1055         self.kvm.get_host_ipa_limit()
1056     }
1057 
1058     ///
1059     /// Retrieve TDX capabilities
1060     ///
1061     #[cfg(feature = "tdx")]
1062     fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
1063         let data = TdxCapabilities {
1064             nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
1065             ..Default::default()
1066         };
1067 
1068         tdx_command(
1069             &self.kvm.as_raw_fd(),
1070             TdxCommand::Capabilities,
1071             0,
1072             &data as *const _ as u64,
1073         )
1074         .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;
1075 
1076         Ok(data)
1077     }
1078 }
1079 /// Vcpu struct for KVM
1080 pub struct KvmVcpu {
1081     fd: VcpuFd,
1082     #[cfg(target_arch = "x86_64")]
1083     msrs: Vec<MsrEntry>,
1084     vm_ops: Option<Arc<dyn vm::VmOps>>,
1085     #[cfg(target_arch = "x86_64")]
1086     hyperv_synic: AtomicBool,
1087 }
1088 /// Implementation of Vcpu trait for KVM
1089 /// Example:
1090 /// #[cfg(feature = "kvm")]
1091 /// extern crate hypervisor;
1092 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1093 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1094 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1095 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1096 /// The returned `vcpu` can then be used via its get/set methods.
1097 ///
1098 impl cpu::Vcpu for KvmVcpu {
1099     #[cfg(target_arch = "x86_64")]
1100     ///
1101     /// Returns the vCPU general purpose registers.
1102     ///
1103     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1104         Ok(self
1105             .fd
1106             .get_regs()
1107             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1108             .into())
1109     }
1110     ///
1111     /// Returns the vCPU general purpose registers.
1112     /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
1113     /// is used to get the registers one by one.
1114     ///
1115     #[cfg(target_arch = "aarch64")]
1116     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1117         let mut state: StandardRegisters = kvm_regs::default();
1118         let mut off = offset__of!(user_pt_regs, regs);
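        // `arm64_core_reg_id!` composes the KVM_GET_ONE_REG id for a core
        // register from KVM_REG_ARM64, a KVM_REG_SIZE_* flag and the field's
        // byte offset within kvm_regs (encoded in 32-bit words).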
1119         // There are 31 user_pt_regs:
1120         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1121         // These are in fact the general-purpose registers of the Armv8-A
1122         // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
1123         for i in 0..31 {
1124             state.regs.regs[i] = self
1125                 .fd
1126                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1127                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1128             off += std::mem::size_of::<u64>();
1129         }
1130 
1131         // We are now entering the "Other register" section of the ARMv8-a architecture.
1132         // First one, stack pointer.
1133         let off = offset__of!(user_pt_regs, sp);
1134         state.regs.sp = self
1135             .fd
1136             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1137             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1138 
1139         // Second one, the program counter.
1140         let off = offset__of!(user_pt_regs, pc);
1141         state.regs.pc = self
1142             .fd
1143             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1144             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1145 
1146         // Next is the processor state.
1147         let off = offset__of!(user_pt_regs, pstate);
1148         state.regs.pstate = self
1149             .fd
1150             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1151             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1152 
1153         // The stack pointer associated with EL1
1154         let off = offset__of!(kvm_regs, sp_el1);
1155         state.sp_el1 = self
1156             .fd
1157             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1158             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1159 
1160         // Exception Link Register for EL1, when taking an exception to EL1, this register
1161         // holds the address to which to return afterwards.
1162         let off = offset__of!(kvm_regs, elr_el1);
1163         state.elr_el1 = self
1164             .fd
1165             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1166             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1167 
1168         // Saved Program Status Registers, there are 5 of them used in the kernel.
1169         let mut off = offset__of!(kvm_regs, spsr);
1170         for i in 0..KVM_NR_SPSR as usize {
1171             state.spsr[i] = self
1172                 .fd
1173                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1174                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1175             off += std::mem::size_of::<u64>();
1176         }
1177 
1178         // Now moving on to the floating-point registers, which are stored in the user_fpsimd_state in the kernel:
1179         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1180         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1181         for i in 0..32 {
1182             state.fp_regs.vregs[i] = self
1183                 .fd
1184                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1185                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1186                 .into();
1187             off += mem::size_of::<u128>();
1188         }
1189 
1190         // Floating-point Status Register
1191         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1192         state.fp_regs.fpsr = self
1193             .fd
1194             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1195             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1196             as u32;
1197 
1198         // Floating-point Control Register
1199         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1200         state.fp_regs.fpcr = self
1201             .fd
1202             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1203             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1204             as u32;
1205         Ok(state)
1206     }
1207     #[cfg(target_arch = "x86_64")]
1208     ///
1209     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1210     ///
1211     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1212         let regs = (*regs).into();
1213         self.fd
1214             .set_regs(&regs)
1215             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1216     }
1217 
1218     ///
1219     /// Sets the vCPU general purpose registers.
1220     /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
1221     /// is used to set the registers one by one.
1222     ///
1223     #[cfg(target_arch = "aarch64")]
1224     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1225         // This function sets the registers in exactly the same order as they
1226         // appear in `state`. See `get_regs` for additional info on the individual registers.
1227         let mut off = offset__of!(user_pt_regs, regs);
1228         for i in 0..31 {
1229             self.fd
1230                 .set_one_reg(
1231                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1232                     state.regs.regs[i],
1233                 )
1234                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1235             off += std::mem::size_of::<u64>();
1236         }
1237 
1238         let off = offset__of!(user_pt_regs, sp);
1239         self.fd
1240             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1241             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1242 
1243         let off = offset__of!(user_pt_regs, pc);
1244         self.fd
1245             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1246             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1247 
1248         let off = offset__of!(user_pt_regs, pstate);
1249         self.fd
1250             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1251             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1252 
1253         let off = offset__of!(kvm_regs, sp_el1);
1254         self.fd
1255             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1256             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1257 
1258         let off = offset__of!(kvm_regs, elr_el1);
1259         self.fd
1260             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1261             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1262 
1263         let mut off = offset__of!(kvm_regs, spsr);
1264         for i in 0..KVM_NR_SPSR as usize {
1265             self.fd
1266                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1267                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1268             off += std::mem::size_of::<u64>();
1269         }
1270 
1271         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
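        // Note: this version of kvm-ioctls' set_one_reg() takes a u64 value,
        // so the `as u64` cast below restores only the low 64 bits of each
        // 128-bit vreg.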
1272         for i in 0..32 {
1273             self.fd
1274                 .set_one_reg(
1275                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1276                     state.fp_regs.vregs[i] as u64,
1277                 )
1278                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1279             off += mem::size_of::<u128>();
1280         }
1281 
1282         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1283         self.fd
1284             .set_one_reg(
1285                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1286                 state.fp_regs.fpsr as u64,
1287             )
1288             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1289 
1290         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1291         self.fd
1292             .set_one_reg(
1293                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1294                 state.fp_regs.fpcr as u64,
1295             )
1296             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1297         Ok(())
1298     }
1299 
1300     #[cfg(target_arch = "x86_64")]
1301     ///
1302     /// Returns the vCPU special registers.
1303     ///
1304     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1305         Ok(self
1306             .fd
1307             .get_sregs()
1308             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1309             .into())
1310     }
1311     #[cfg(target_arch = "x86_64")]
1312     ///
1313     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1314     ///
1315     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1316         let sregs = (*sregs).into();
1317         self.fd
1318             .set_sregs(&sregs)
1319             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1320     }
1321     #[cfg(target_arch = "x86_64")]
1322     ///
1323     /// Returns the floating point state (FPU) from the vCPU.
1324     ///
1325     fn get_fpu(&self) -> cpu::Result<FpuState> {
1326         Ok(self
1327             .fd
1328             .get_fpu()
1329             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1330             .into())
1331     }
1332     #[cfg(target_arch = "x86_64")]
1333     ///
1334     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1335     ///
1336     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1337         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1338         self.fd
1339             .set_fpu(&fpu)
1340             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1341     }
1342     #[cfg(target_arch = "x86_64")]
1343     ///
1344     /// X86 specific call to setup the CPUID registers.
1345     ///
1346     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1347         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1348             cpuid.iter().map(|e| (*e).into()).collect();
1349         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1350             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1351 
1352         self.fd
1353             .set_cpuid2(&kvm_cpuid)
1354             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1355     }
1356     #[cfg(target_arch = "x86_64")]
1357     ///
1358     /// X86 specific call to enable HyperV SynIC
1359     ///
1360     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1361         // Record that Hyper-V SynIC is enabled and emulated, as this will
1362         // later influence which MSRs should be saved.
1363         self.hyperv_synic.store(true, Ordering::Release);
1364 
1365         let cap = kvm_enable_cap {
1366             cap: KVM_CAP_HYPERV_SYNIC,
1367             ..Default::default()
1368         };
1369         self.fd
1370             .enable_cap(&cap)
1371             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1372     }
1373     ///
1374     /// X86 specific call to retrieve the CPUID registers.
1375     ///
1376     #[cfg(target_arch = "x86_64")]
1377     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1378         let kvm_cpuid = self
1379             .fd
1380             .get_cpuid2(num_entries)
1381             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1382 
1383         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1384 
1385         Ok(v)
1386     }
1387     #[cfg(target_arch = "x86_64")]
1388     ///
1389     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1390     ///
1391     fn get_lapic(&self) -> cpu::Result<LapicState> {
1392         Ok(self
1393             .fd
1394             .get_lapic()
1395             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1396             .into())
1397     }
1398     #[cfg(target_arch = "x86_64")]
1399     ///
1400     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1401     ///
1402     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1403         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1404         self.fd
1405             .set_lapic(&klapic)
1406             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1407     }
1408     #[cfg(target_arch = "x86_64")]
1409     ///
1410     /// Returns the model-specific registers (MSR) for this vCPU.
1411     ///
1412     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1413         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1414         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1415         let succ = self
1416             .fd
1417             .get_msrs(&mut kvm_msrs)
1418             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1419 
1420         msrs[..succ].copy_from_slice(
1421             &kvm_msrs.as_slice()[..succ]
1422                 .iter()
1423                 .map(|e| (*e).into())
1424                 .collect::<Vec<MsrEntry>>(),
1425         );
1426 
1427         Ok(succ)
1428     }
1429     #[cfg(target_arch = "x86_64")]
1430     ///
1431     /// Set up the model-specific registers (MSR) for this vCPU.
1432     /// Returns the number of MSR entries actually written.
1433     ///
1434     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1435         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1436         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1437         self.fd
1438             .set_msrs(&kvm_msrs)
1439             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1440     }
1441     ///
1442     /// Returns the vcpu's current "multiprocessing state".
1443     ///
1444     fn get_mp_state(&self) -> cpu::Result<MpState> {
1445         Ok(self
1446             .fd
1447             .get_mp_state()
1448             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1449             .into())
1450     }
1451     ///
1452     /// Sets the vcpu's current "multiprocessing state".
1453     ///
1454     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1455         self.fd
1456             .set_mp_state(mp_state.into())
1457             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1458     }
1459     #[cfg(target_arch = "x86_64")]
1460     ///
1461     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1462     ///
1463     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1464         let tr = self
1465             .fd
1466             .translate_gva(gva)
1467             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1468         // tr.valid is set if the GVA is mapped to a valid GPA.
1469         match tr.valid {
1470             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1471                 "Invalid GVA: {:#x}",
1472                 gva
1473             ))),
1474             _ => Ok((tr.physical_address, 0)),
1475         }
1476     }
1477     ///
1478     /// Triggers the running of the current virtual CPU, returning an exit reason.
1479     ///
1480     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1481         match self.fd.run() {
1482             Ok(run) => match run {
1483                 #[cfg(target_arch = "x86_64")]
1484                 VcpuExit::IoIn(addr, data) => {
1485                     if let Some(vm_ops) = &self.vm_ops {
1486                         return vm_ops
1487                             .pio_read(addr.into(), data)
1488                             .map(|_| cpu::VmExit::Ignore)
1489                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1490                     }
1491 
1492                     Ok(cpu::VmExit::IoIn(addr, data))
1493                 }
1494                 #[cfg(target_arch = "x86_64")]
1495                 VcpuExit::IoOut(addr, data) => {
1496                     if let Some(vm_ops) = &self.vm_ops {
1497                         return vm_ops
1498                             .pio_write(addr.into(), data)
1499                             .map(|_| cpu::VmExit::Ignore)
1500                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1501                     }
1502 
1503                     Ok(cpu::VmExit::IoOut(addr, data))
1504                 }
1505                 #[cfg(target_arch = "x86_64")]
1506                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1507                 #[cfg(target_arch = "x86_64")]
1508                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1509 
1510                 #[cfg(target_arch = "aarch64")]
1511                 VcpuExit::SystemEvent(event_type, flags) => {
1512                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1515                     if event_type == KVM_SYSTEM_EVENT_RESET {
1516                         Ok(cpu::VmExit::Reset)
1517                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1518                         Ok(cpu::VmExit::Shutdown)
1519                     } else {
1520                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1521                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1522                             event_type,
1523                             flags
1524                         )))
1525                     }
1526                 }
1527 
1528                 VcpuExit::MmioRead(addr, data) => {
1529                     if let Some(vm_ops) = &self.vm_ops {
1530                         return vm_ops
1531                             .mmio_read(addr, data)
1532                             .map(|_| cpu::VmExit::Ignore)
1533                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1534                     }
1535 
1536                     Ok(cpu::VmExit::MmioRead(addr, data))
1537                 }
1538                 VcpuExit::MmioWrite(addr, data) => {
1539                     if let Some(vm_ops) = &self.vm_ops {
1540                         return vm_ops
1541                             .mmio_write(addr, data)
1542                             .map(|_| cpu::VmExit::Ignore)
1543                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1544                     }
1545 
1546                     Ok(cpu::VmExit::MmioWrite(addr, data))
1547                 }
1548                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1549                 #[cfg(feature = "tdx")]
1550                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1551                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1552 
1553                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1554                     "Unexpected exit reason on vcpu run: {:?}",
1555                     r
1556                 ))),
1557             },
1558 
1559             Err(ref e) => match e.errno() {
1560                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1561                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1562                     "VCPU error {:?}",
1563                     e
1564                 ))),
1565             },
1566         }
1567     }
1568     #[cfg(target_arch = "x86_64")]
1569     ///
    /// Lets the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
1572     ///
1573     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1574         if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet
            // initialised, which could be because we're still running firmware
            // or because the guest doesn't use the KVM clock.
1578             if e.errno() != libc::EINVAL {
1579                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1580             }
1581         }
1582 
1583         Ok(())
1584     }
1585     #[cfg(target_arch = "x86_64")]
1586     ///
    /// Sets debug registers to install hardware breakpoints and/or enable
    /// single-stepping.
1588     ///
1589     fn set_guest_debug(
1590         &self,
1591         addrs: &[vm_memory::GuestAddress],
1592         singlestep: bool,
1593     ) -> cpu::Result<()> {
1594         if addrs.len() > 4 {
1595             return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
1596                 "Support 4 breakpoints at most but {} addresses are passed",
1597                 addrs.len()
1598             )));
1599         }
1600 
1601         let mut dbg = kvm_guest_debug {
1602             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1603             ..Default::default()
1604         };
1605         if singlestep {
1606             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1607         }
1608 
        // Set bits 9 and 10 of DR7:
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: reserved, always reads as 1.
1612         dbg.arch.debugreg[7] = 0x0600;
1613 
1614         for (i, addr) in addrs.iter().enumerate() {
1615             dbg.arch.debugreg[i] = addr.0;
            // Set the global enable (G0..G3) flag for this breakpoint.
1617             dbg.arch.debugreg[7] |= 2 << (i * 2);
1618         }
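        // For example, with two breakpoint addresses the loop sets G0 and G1,
        // leaving DR7 = 0x0600 | 0b1010 = 0x060a.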
1619 
1620         self.fd
1621             .set_guest_debug(&dbg)
1622             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1623     }
1624     #[cfg(target_arch = "aarch64")]
1625     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1626         self.fd
1627             .vcpu_init(kvi)
1628             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1629     }
1630     ///
1631     /// Gets a list of the guest registers that are supported for the
1632     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1633     ///
1634     #[cfg(target_arch = "aarch64")]
1635     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1636         self.fd
1637             .get_reg_list(reg_list)
1638             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1639     }
1640     ///
1641     /// Gets the value of a system register
1642     ///
1643     #[cfg(target_arch = "aarch64")]
1644     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1645         //
        // The Arm Architecture Reference Manual defines the encoding of
        // AArch64 system registers, see
        // https://developer.arm.com/documentation/ddi0487 (chapter D12),
        // while KVM defines its own ID for each AArch64 system register,
        // which is used when calling `KVM_GET/SET_ONE_REG` to access a
        // system register of a guest.
        // A mapping exists between the Arm standard encoding and the KVM ID.
        // This function takes the standard u32 encoding as an input parameter,
        // converts it to the corresponding KVM ID, and calls `KVM_GET_ONE_REG`
        // to read the value of the system register.
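        //
        // For example, MPIDR_EL1 (op0=3, op1=0, CRn=0, CRm=0, op2=5) has the
        // standard encoding 0x1800a0; shifting it right by 5 and masking
        // yields the 16-bit KVM field 0xc005, which is then OR-ed with the
        // KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG prefix.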
1656         //
1657         let id: u64 = KVM_REG_ARM64 as u64
1658             | KVM_REG_SIZE_U64 as u64
1659             | KVM_REG_ARM64_SYSREG as u64
1660             | ((((sys_reg) >> 5)
1661                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1662                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1663                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1664                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1665                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1666         self.fd
1667             .get_one_reg(id)
1668             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1669     }
1670     ///
1671     /// Configure core registers for a given CPU.
1672     ///
1673     #[cfg(target_arch = "aarch64")]
1674     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1675         #[allow(non_upper_case_globals)]
1676         // PSR (Processor State Register) bits.
1677         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1678         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1679         const PSR_F_BIT: u64 = 0x0000_0040;
1680         const PSR_I_BIT: u64 = 0x0000_0080;
1681         const PSR_A_BIT: u64 = 0x0000_0100;
1682         const PSR_D_BIT: u64 = 0x0000_0200;
1683         // Taken from arch/arm64/kvm/inject_fault.c.
1684         const PSTATE_FAULT_BITS_64: u64 =
1685             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1686 
1687         let kreg_off = offset__of!(kvm_regs, regs);
1688 
        // Compute the offset of the PSTATE (Processor State) register within
        // kvm_regs, which is used to build its register id.
1690         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1691         self.fd
1692             .set_one_reg(
1693                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1694                 PSTATE_FAULT_BITS_64,
1695             )
1696             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1697 
        // Other vCPUs are initially powered off, awaiting a PSCI wakeup.
1699         if cpu_id == 0 {
            // Set the PC (Program Counter) to the guest entry point (kernel address).
1701             let pc = offset__of!(user_pt_regs, pc) + kreg_off;
1702             self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip)
1704                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1705 
1706             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1707             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1708             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
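            // regs[0] is x0, which the arm64 Linux boot protocol requires to
            // hold the physical address of the device tree blob.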
1710             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1711             self.fd
1712                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
1713                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1714         }
1715         Ok(())
1716     }
1717 
1718     #[cfg(target_arch = "x86_64")]
1719     ///
1720     /// Get the current CPU state
1721     ///
1722     /// Ordering requirements:
1723     ///
1724     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
1726     /// else, otherwise we cannot restore everything and expect it to work.
1727     ///
1728     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1729     /// still running.
1730     ///
1731     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1732     ///
    /// GET_VCPU_EVENTS should probably be saved last, since it looks as if it
    /// could be affected by the internal state modifications of the other GET
    /// ioctls.
1736     ///
1737     /// SREGS saves/restores a pending interrupt, similar to what
1738     /// VCPU_EVENTS also does.
1739     ///
1740     /// GET_MSRS requires a pre-populated data structure to do something
1741     /// meaningful. For SET_MSRS it will then contain good data.
1742     ///
1743     /// # Example
1744     ///
1745     /// ```rust
1746     /// # extern crate hypervisor;
1747     /// # use hypervisor::KvmHypervisor;
1748     /// # use std::sync::Arc;
1749     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1750     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1751     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1752     /// vm.enable_split_irq().unwrap();
1753     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1754     /// let state = vcpu.state().unwrap();
1755     /// ```
1756     fn state(&self) -> cpu::Result<CpuState> {
1757         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1758         let mp_state = self.get_mp_state()?.into();
1759         let regs = self.get_regs()?;
1760         let sregs = self.get_sregs()?;
1761         let xsave = self.get_xsave()?;
1762         let xcrs = self.get_xcrs()?;
1763         let lapic_state = self.get_lapic()?;
1764         let fpu = self.get_fpu()?;
1765 
        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back on a slower method, getting the MSRs
        // in chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some are not supported.
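        //
        // For example, with 20 expected entries, if GET_MSRS returns 10 then
        // entry 10 is faulty: we keep entries 0..10, skip the faulty one, and
        // retry from entry 11 until the remainder reads back completely.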
1771         let mut msr_entries = self.msrs.clone();
1772 
1773         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1774         // emulated.
1775         if self.hyperv_synic.load(Ordering::Acquire) {
1776             let hyperv_synic_msrs = vec![
1777                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1778                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1779                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1780                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1781                 0x400000b5, 0x400000b6, 0x400000b7,
1782             ];
1783             for index in hyperv_synic_msrs {
1784                 let msr = kvm_msr_entry {
1785                     index,
1786                     ..Default::default()
1787                 };
1788                 msr_entries.push(msr.into());
1789             }
1790         }
1791 
1792         let expected_num_msrs = msr_entries.len();
1793         let num_msrs = self.get_msrs(&mut msr_entries)?;
1794         let msrs = if num_msrs != expected_num_msrs {
1795             let mut faulty_msr_index = num_msrs;
1796             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
1797 
1798             loop {
1799                 warn!(
1800                     "Detected faulty MSR 0x{:x} while getting MSRs",
1801                     msr_entries[faulty_msr_index].index
1802                 );
1803 
1804                 // Skip the first bad MSR
1805                 let start_pos = faulty_msr_index + 1;
1806 
1807                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
1808                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1809 
1810                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
1811 
1812                 if num_msrs == sub_msr_entries.len() {
1813                     break;
1814                 }
1815 
1816                 faulty_msr_index = start_pos + num_msrs;
1817             }
1818 
1819             msr_entries_tmp
1820         } else {
1821             msr_entries
1822         };
1823 
1824         let vcpu_events = self.get_vcpu_events()?;
1825 
1826         Ok(VcpuKvmState {
1827             cpuid,
1828             msrs,
1829             vcpu_events,
1830             regs: regs.into(),
1831             sregs: sregs.into(),
1832             fpu,
1833             lapic_state,
1834             xsave,
1835             xcrs,
1836             mp_state,
1837         }
1838         .into())
1839     }
1840     ///
1841     /// Get the current AArch64 CPU state
1842     ///
1843     #[cfg(target_arch = "aarch64")]
1844     fn state(&self) -> cpu::Result<CpuState> {
1845         let mut state = VcpuKvmState {
1846             mp_state: self.get_mp_state()?.into(),
1847             ..Default::default()
1848         };
1849         // Get core registers
1850         state.core_regs = self.get_regs()?;
1851 
1852         // Get systerm register
1853         // Call KVM_GET_REG_LIST to get all registers available to the guest.
1854         // For ArmV8 there are around 500 registers.
1855         let mut sys_regs: Vec<Register> = Vec::new();
1856         let mut reg_list = RegList::new(500).unwrap();
1857         self.fd
1858             .get_reg_list(&mut reg_list)
1859             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1860 
        // At this point reg_list should contain both the core registers and
        // the system registers.
        // The register list contains the number of registers and their ids.
        // We will need to call KVM_GET_ONE_REG on each id in order to save
        // all of them. We carve the core registers out of the list, as they
        // are represented in the kernel by the kvm_regs structure, for which
        // we can calculate each id from its offset within the structure.
1868         reg_list.retain(|regid| is_system_register(*regid));
1869 
        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
1872         let indices = reg_list.as_slice();
1873         for index in indices.iter() {
1874             sys_regs.push(kvm_bindings::kvm_one_reg {
1875                 id: *index,
1876                 addr: self
1877                     .fd
1878                     .get_one_reg(*index)
1879                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1880             });
1881         }
1882 
1883         state.sys_regs = sys_regs;
1884 
1885         Ok(state.into())
1886     }
1887     #[cfg(target_arch = "x86_64")]
1888     ///
1889     /// Restore the previously saved CPU state
1890     ///
1891     /// Ordering requirements:
1892     ///
1893     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1894     /// still running.
1895     ///
1896     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1897     /// if we ever change the BSP, we have to do that before restoring anything.
1898     /// The same seems to be true for CPUID stuff.
1899     ///
1900     /// SREGS saves/restores a pending interrupt, similar to what
1901     /// VCPU_EVENTS also does.
1902     ///
1903     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1904     /// done before SET_VCPU_EVENTS, which restores it.
1905     ///
1906     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1907     /// the apic base msr.
1908     ///
1909     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1910     /// only restores successfully, when the LAPIC is correctly configured.
1911     ///
1912     /// Arguments: CpuState
1913     /// # Example
1914     ///
1915     /// ```rust
1916     /// # extern crate hypervisor;
1917     /// # use hypervisor::KvmHypervisor;
1918     /// # use std::sync::Arc;
1919     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1920     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1921     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1922     /// vm.enable_split_irq().unwrap();
1923     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1924     /// let state = vcpu.state().unwrap();
1925     /// vcpu.set_state(&state).unwrap();
1926     /// ```
1927     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1928         let state: VcpuKvmState = state.clone().into();
1929         self.set_cpuid2(&state.cpuid)?;
1930         self.set_mp_state(state.mp_state.into())?;
1931         self.set_regs(&state.regs.into())?;
1932         self.set_sregs(&state.sregs.into())?;
1933         self.set_xsave(&state.xsave)?;
1934         self.set_xcrs(&state.xcrs)?;
1935         self.set_lapic(&state.lapic_state)?;
1936         self.set_fpu(&state.fpu)?;
1937 
1938         // Try to set all MSRs previously stored.
        // If the number of MSRs set through SET_MSRS is different from the
        // expected amount, we fall back on a slower method, setting the MSRs
        // in chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some are not supported.
1943         let expected_num_msrs = state.msrs.len();
1944         let num_msrs = self.set_msrs(&state.msrs)?;
1945         if num_msrs != expected_num_msrs {
1946             let mut faulty_msr_index = num_msrs;
1947 
1948             loop {
1949                 warn!(
1950                     "Detected faulty MSR 0x{:x} while setting MSRs",
1951                     state.msrs[faulty_msr_index].index
1952                 );
1953 
1954                 // Skip the first bad MSR
1955                 let start_pos = faulty_msr_index + 1;
1956 
1957                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
1958 
1959                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
1960 
1961                 if num_msrs == sub_msr_entries.len() {
1962                     break;
1963                 }
1964 
1965                 faulty_msr_index = start_pos + num_msrs;
1966             }
1967         }
1968 
1969         self.set_vcpu_events(&state.vcpu_events)?;
1970 
1971         Ok(())
1972     }
1973     ///
1974     /// Restore the previously saved AArch64 CPU state
1975     ///
1976     #[cfg(target_arch = "aarch64")]
1977     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1978         let state: VcpuKvmState = state.clone().into();
1979         // Set core registers
1980         self.set_regs(&state.core_regs)?;
1981         // Set system registers
1982         for reg in &state.sys_regs {
1983             self.fd
1984                 .set_one_reg(reg.id, reg.addr)
1985                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1986         }
1987 
1988         self.set_mp_state(state.mp_state.into())?;
1989 
1990         Ok(())
1991     }
1992 
1993     ///
1994     /// Initialize TDX for this CPU
1995     ///
1996     #[cfg(feature = "tdx")]
1997     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1998         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1999             .map_err(cpu::HypervisorCpuError::InitializeTdx)
2000     }
2001 
2002     ///
2003     /// Set the "immediate_exit" state
2004     ///
2005     fn set_immediate_exit(&self, exit: bool) {
2006         self.fd.set_kvm_immediate_exit(exit.into());
2007     }
2008 
2009     ///
2010     /// Returns the details about TDX exit reason
2011     ///
2012     #[cfg(feature = "tdx")]
2013     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2014         let kvm_run = self.fd.get_kvm_run();
2015         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2016 
2017         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2018 
2019         if tdx_vmcall.type_ != 0 {
2020             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2021         }
2022 
2023         match tdx_vmcall.subfunction {
2024             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2025             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2026                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2027             }
2028             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2029         }
2030     }
2031 
2032     ///
2033     /// Set the status code for TDX exit
2034     ///
2035     #[cfg(feature = "tdx")]
2036     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2037         let kvm_run = self.fd.get_kvm_run();
2038         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2039 
2040         tdx_vmcall.status_code = match status {
2041             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2042             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2043         };
2044     }
2045     #[cfg(target_arch = "x86_64")]
2046     ///
2047     /// Return the list of initial MSR entries for a VCPU
2048     ///
2049     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2050         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2051 
2052         [
2053             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2054             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2055             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2056             msr!(msr_index::MSR_STAR),
2057             msr!(msr_index::MSR_CSTAR),
2058             msr!(msr_index::MSR_LSTAR),
2059             msr!(msr_index::MSR_KERNEL_GS_BASE),
2060             msr!(msr_index::MSR_SYSCALL_MASK),
2061             msr!(msr_index::MSR_IA32_TSC),
2062             msr_data!(
2063                 msr_index::MSR_IA32_MISC_ENABLE,
2064                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2065             ),
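            // Enable MTRRs and set the default memory type to write-back (WB).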
2066             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2067         ]
2068         .to_vec()
2069     }
2070     #[cfg(target_arch = "aarch64")]
2071     fn has_pmu_support(&self) -> bool {
2072         let cpu_attr = kvm_bindings::kvm_device_attr {
2073             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2074             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2075             addr: 0x0,
2076             flags: 0,
2077         };
2078         self.fd.has_device_attr(&cpu_attr).is_ok()
2079     }
2080     #[cfg(target_arch = "aarch64")]
2081     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2082         let cpu_attr = kvm_bindings::kvm_device_attr {
2083             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2084             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2085             addr: 0x0,
2086             flags: 0,
2087         };
2088         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2089             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2090             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2091             addr: &irq as *const u32 as u64,
2092             flags: 0,
2093         };
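        // KVM requires the PMU overflow interrupt to be configured before the
        // PMU itself is initialized, hence the IRQ attribute is set first.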
2094         self.fd
2095             .set_device_attr(&cpu_attr_irq)
2096             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2097         self.fd
2098             .set_device_attr(&cpu_attr)
2099             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2100     }
2101 }
2102 
2103 impl KvmVcpu {
2104     #[cfg(target_arch = "x86_64")]
2105     ///
2106     /// X86 specific call that returns the vcpu's current "xsave struct".
2107     ///
2108     fn get_xsave(&self) -> cpu::Result<Xsave> {
2109         self.fd
2110             .get_xsave()
2111             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2112     }
2113     #[cfg(target_arch = "x86_64")]
2114     ///
2115     /// X86 specific call that sets the vcpu's current "xsave struct".
2116     ///
2117     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2118         self.fd
2119             .set_xsave(xsave)
2120             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2121     }
2122     #[cfg(target_arch = "x86_64")]
2123     ///
2124     /// X86 specific call that returns the vcpu's current "xcrs".
2125     ///
2126     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2127         self.fd
2128             .get_xcrs()
2129             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2130     }
2131     #[cfg(target_arch = "x86_64")]
2132     ///
2133     /// X86 specific call that sets the vcpu's current "xcrs".
2134     ///
2135     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2136         self.fd
2137             .set_xcrs(xcrs)
2138             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2139     }
2140     #[cfg(target_arch = "x86_64")]
2141     ///
2142     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2143     /// states of the vcpu.
2144     ///
2145     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2146         self.fd
2147             .get_vcpu_events()
2148             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2149     }
2150     #[cfg(target_arch = "x86_64")]
2151     ///
2152     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2153     /// of the vcpu.
2154     ///
2155     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2156         self.fd
2157             .set_vcpu_events(events)
2158             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2159     }
2160 }
2161