xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 686e6d50824fcc7403a51b91545899a6301d6216)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 use crate::aarch64::gic::KvmGicV3Its;
13 #[cfg(target_arch = "aarch64")]
14 pub use crate::aarch64::{
15     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
16     VcpuKvmState, MPIDR_EL1,
17 };
18 #[cfg(target_arch = "aarch64")]
19 use crate::arch::aarch64::gic::Vgic;
20 use crate::cpu;
21 use crate::hypervisor;
22 use crate::vec_with_array_field;
23 use crate::vm::{self, InterruptSourceConfig, VmOps};
24 use crate::HypervisorType;
25 #[cfg(target_arch = "aarch64")]
26 use crate::{arm64_core_reg_id, offset__of};
27 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
28 use std::any::Any;
29 use std::collections::HashMap;
30 #[cfg(target_arch = "aarch64")]
31 use std::convert::TryInto;
32 #[cfg(target_arch = "x86_64")]
33 use std::fs::File;
34 #[cfg(target_arch = "x86_64")]
35 use std::os::unix::io::AsRawFd;
36 #[cfg(feature = "tdx")]
37 use std::os::unix::io::RawFd;
38 use std::result;
39 #[cfg(target_arch = "x86_64")]
40 use std::sync::atomic::{AtomicBool, Ordering};
41 #[cfg(target_arch = "aarch64")]
42 use std::sync::Mutex;
43 use std::sync::{Arc, RwLock};
44 use vmm_sys_util::eventfd::EventFd;
45 // x86_64 dependencies
46 #[cfg(target_arch = "x86_64")]
47 pub mod x86_64;
48 #[cfg(target_arch = "x86_64")]
49 use crate::arch::x86::{
50     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
51     NUM_IOAPIC_PINS,
52 };
53 #[cfg(target_arch = "x86_64")]
54 use crate::ClockData;
55 use crate::{
56     CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
57     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
58 };
59 #[cfg(target_arch = "aarch64")]
60 use aarch64::{RegList, Register, StandardRegisters};
61 #[cfg(target_arch = "x86_64")]
62 use kvm_bindings::{
63     kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
64     KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
65 };
66 #[cfg(target_arch = "x86_64")]
67 use x86_64::check_required_kvm_extensions;
68 #[cfg(target_arch = "x86_64")]
69 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
70 // aarch64 dependencies
71 #[cfg(target_arch = "aarch64")]
72 pub mod aarch64;
73 pub use kvm_bindings;
74 #[cfg(feature = "tdx")]
75 use kvm_bindings::KVMIO;
76 pub use kvm_bindings::{
77     kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
78     kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
79     KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
80 };
81 #[cfg(target_arch = "aarch64")]
82 use kvm_bindings::{
83     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
84     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
85 };
86 pub use kvm_ioctls;
87 pub use kvm_ioctls::{Cap, Kvm};
88 #[cfg(target_arch = "aarch64")]
89 use std::mem;
90 use thiserror::Error;
91 use vfio_ioctls::VfioDeviceFd;
92 #[cfg(feature = "tdx")]
93 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
94 ///
95 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
96 ///
97 pub use {
98     kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
99     kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
100 };
101 
102 #[cfg(target_arch = "x86_64")]
103 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
104 
105 #[cfg(feature = "tdx")]
106 const KVM_EXIT_TDX: u32 = 35;
107 #[cfg(feature = "tdx")]
108 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
109 #[cfg(feature = "tdx")]
110 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
111 #[cfg(feature = "tdx")]
112 const TDG_VP_VMCALL_SUCCESS: u64 = 0;
113 #[cfg(feature = "tdx")]
114 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;
115 
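// Note: `ioctl_iowr_nr!` (from vmm-sys-util) expands to a
// `KVM_MEMORY_ENCRYPT_OP()` helper that returns the _IOWR(KVMIO, 0xba,
// c_ulong) ioctl request number which `tdx_command` passes to
// `ioctl_with_val` for every TDX command.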
116 #[cfg(feature = "tdx")]
117 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
118 
119 #[cfg(feature = "tdx")]
120 #[repr(u32)]
121 enum TdxCommand {
122     Capabilities = 0,
123     InitVm,
124     InitVcpu,
125     InitMemRegion,
126     Finalize,
127 }
128 
129 #[cfg(feature = "tdx")]
130 pub enum TdxExitDetails {
131     GetQuote,
132     SetupEventNotifyInterrupt,
133 }
134 
135 #[cfg(feature = "tdx")]
136 pub enum TdxExitStatus {
137     Success,
138     InvalidOperand,
139 }
140 
141 #[cfg(feature = "tdx")]
142 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
143 
144 #[cfg(feature = "tdx")]
145 #[repr(C)]
146 #[derive(Debug, Default)]
147 pub struct TdxCpuidConfig {
148     pub leaf: u32,
149     pub sub_leaf: u32,
150     pub eax: u32,
151     pub ebx: u32,
152     pub ecx: u32,
153     pub edx: u32,
154 }
155 
156 #[cfg(feature = "tdx")]
157 #[repr(C)]
158 #[derive(Debug, Default)]
159 pub struct TdxCapabilities {
160     pub attrs_fixed0: u64,
161     pub attrs_fixed1: u64,
162     pub xfam_fixed0: u64,
163     pub xfam_fixed1: u64,
164     pub nr_cpuid_configs: u32,
165     pub padding: u32,
166     pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
167 }
168 
169 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
170     fn from(region: kvm_userspace_memory_region) -> Self {
171         let mut flags = USER_MEMORY_REGION_READ;
172         if region.flags & KVM_MEM_READONLY == 0 {
173             flags |= USER_MEMORY_REGION_WRITE;
174         }
175         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
176             flags |= USER_MEMORY_REGION_LOG_DIRTY;
177         }
178 
179         UserMemoryRegion {
180             slot: region.slot,
181             guest_phys_addr: region.guest_phys_addr,
182             memory_size: region.memory_size,
183             userspace_addr: region.userspace_addr,
184             flags,
185         }
186     }
187 }
188 
189 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
190     fn from(region: UserMemoryRegion) -> Self {
191         assert!(
192             region.flags & USER_MEMORY_REGION_READ != 0,
193             "KVM mapped memory is always readable"
194         );
195 
196         let mut flags = 0;
197         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
198             flags |= KVM_MEM_READONLY;
199         }
200         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
201             flags |= KVM_MEM_LOG_DIRTY_PAGES;
202         }
203 
204         kvm_userspace_memory_region {
205             slot: region.slot,
206             guest_phys_addr: region.guest_phys_addr,
207             memory_size: region.memory_size,
208             userspace_addr: region.userspace_addr,
209             flags,
210         }
211     }
212 }
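// Illustration of the two conversions above (a sketch, not code exercised in
// this file): a `UserMemoryRegion` with flags
// `USER_MEMORY_REGION_READ | USER_MEMORY_REGION_LOG_DIRTY` maps to a KVM
// region with `KVM_MEM_READONLY | KVM_MEM_LOG_DIRTY_PAGES`, since it is the
// absence of the write flag that turns into `KVM_MEM_READONLY`.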
213 
214 impl From<kvm_mp_state> for MpState {
215     fn from(s: kvm_mp_state) -> Self {
216         MpState::Kvm(s)
217     }
218 }
219 
220 impl From<MpState> for kvm_mp_state {
221     fn from(ms: MpState) -> Self {
222         match ms {
223             MpState::Kvm(s) => s,
224             /* Needed in case other hypervisors are enabled */
225             #[allow(unreachable_patterns)]
226             _ => panic!("MpState is not valid"),
227         }
228     }
229 }
230 
231 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
232     fn from(a: kvm_ioctls::IoEventAddress) -> Self {
233         match a {
234             kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
235             kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
236         }
237     }
238 }
239 
240 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
241     fn from(a: IoEventAddress) -> Self {
242         match a {
243             IoEventAddress::Pio(x) => Self::Pio(x),
244             IoEventAddress::Mmio(x) => Self::Mmio(x),
245         }
246     }
247 }
248 
249 impl From<VcpuKvmState> for CpuState {
250     fn from(s: VcpuKvmState) -> Self {
251         CpuState::Kvm(s)
252     }
253 }
254 
255 impl From<CpuState> for VcpuKvmState {
256     fn from(s: CpuState) -> Self {
257         match s {
258             CpuState::Kvm(s) => s,
259             /* Needed in case other hypervisors are enabled */
260             #[allow(unreachable_patterns)]
261             _ => panic!("CpuState is not valid"),
262         }
263     }
264 }
265 
266 #[cfg(target_arch = "x86_64")]
267 impl From<kvm_clock_data> for ClockData {
268     fn from(d: kvm_clock_data) -> Self {
269         ClockData::Kvm(d)
270     }
271 }
272 
273 #[cfg(target_arch = "x86_64")]
274 impl From<ClockData> for kvm_clock_data {
275     fn from(ms: ClockData) -> Self {
276         match ms {
277             ClockData::Kvm(s) => s,
278             /* Needed in case other hypervisors are enabled */
279             #[allow(unreachable_patterns)]
280             _ => panic!("ClockData is not valid"),
281         }
282     }
283 }
284 
285 impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
286     fn from(s: kvm_irq_routing_entry) -> Self {
287         IrqRoutingEntry::Kvm(s)
288     }
289 }
290 
291 impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
292     fn from(e: IrqRoutingEntry) -> Self {
293         match e {
294             IrqRoutingEntry::Kvm(e) => e,
295             /* Needed in case other hypervisors are enabled */
296             #[allow(unreachable_patterns)]
297             _ => panic!("IrqRoutingEntry is not valid"),
298         }
299     }
300 }
301 
302 struct KvmDirtyLogSlot {
303     slot: u32,
304     guest_phys_addr: u64,
305     memory_size: u64,
306     userspace_addr: u64,
307 }
308 
309 /// Wrapper over KVM VM ioctls.
310 pub struct KvmVm {
311     fd: Arc<VmFd>,
312     #[cfg(target_arch = "x86_64")]
313     msrs: Vec<MsrEntry>,
314     dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
315 }
316 
317 impl KvmVm {
318     ///
319     /// Creates an emulated device in the kernel.
320     ///
321     /// See the documentation for `KVM_CREATE_DEVICE`.
322     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
323         let device_fd = self
324             .fd
325             .create_device(device)
326             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
327         Ok(VfioDeviceFd::new_from_kvm(device_fd))
328     }
329     /// Checks if a particular `Cap` is available.
330     fn check_extension(&self, c: Cap) -> bool {
331         self.fd.check_extension(c)
332     }
333 }
334 
335 ///
336 /// Implementation of Vm trait for KVM
337 /// Example:
338 /// #[cfg(feature = "kvm")]
339 /// extern crate hypervisor;
340 /// // `KvmHypervisor::new()` already returns an `Arc<dyn hypervisor::Hypervisor>`.
341 /// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
342 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
343 /// // ... then call the `Vm` trait's getters/setters on `vm`.
344 ///
345 impl vm::Vm for KvmVm {
346     #[cfg(target_arch = "x86_64")]
347     ///
348     /// Sets the address of the one-page region in the VM's address space.
349     ///
350     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
351         self.fd
352             .set_identity_map_address(address)
353             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
354     }
355     #[cfg(target_arch = "x86_64")]
356     ///
357     /// Sets the address of the three-page region in the VM's address space.
358     ///
359     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
360         self.fd
361             .set_tss_address(offset)
362             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
363     }
364     ///
365     /// Creates an in-kernel interrupt controller.
366     ///
367     fn create_irq_chip(&self) -> vm::Result<()> {
368         self.fd
369             .create_irq_chip()
370             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
371     }
372     ///
373     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
374     ///
375     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
376         self.fd
377             .register_irqfd(fd, gsi)
378             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
379     }
380     ///
381     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
382     ///
383     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
384         self.fd
385             .unregister_irqfd(fd, gsi)
386             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
387     }
388     ///
389     /// Creates a vCPU for the given id and returns it as a `Vcpu` trait object.
390     ///
391     fn create_vcpu(
392         &self,
393         id: u8,
394         vm_ops: Option<Arc<dyn VmOps>>,
395     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
396         let vc = self
397             .fd
398             .create_vcpu(id as u64)
399             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
400         let vcpu = KvmVcpu {
401             fd: vc,
402             #[cfg(target_arch = "x86_64")]
403             msrs: self.msrs.clone(),
404             vm_ops,
405             #[cfg(target_arch = "x86_64")]
406             hyperv_synic: AtomicBool::new(false),
407         };
408         Ok(Arc::new(vcpu))
409     }
410     #[cfg(target_arch = "aarch64")]
411     ///
412     /// Creates a virtual GIC device.
413     ///
414     fn create_vgic(
415         &self,
416         vcpu_count: u64,
417         dist_addr: u64,
418         dist_size: u64,
419         redist_size: u64,
420         msi_size: u64,
421         nr_irqs: u32,
422     ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
423         let gic_device = KvmGicV3Its::new(
424             self,
425             vcpu_count,
426             dist_addr,
427             dist_size,
428             redist_size,
429             msi_size,
430             nr_irqs,
431         )
432         .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
433         Ok(Arc::new(Mutex::new(gic_device)))
434     }
435     ///
436     /// Registers an event to be signaled whenever a certain address is written to.
437     ///
438     fn register_ioevent(
439         &self,
440         fd: &EventFd,
441         addr: &IoEventAddress,
442         datamatch: Option<vm::DataMatch>,
443     ) -> vm::Result<()> {
444         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
445         if let Some(dm) = datamatch {
446             match dm {
447                 vm::DataMatch::DataMatch32(kvm_dm32) => self
448                     .fd
449                     .register_ioevent(fd, addr, kvm_dm32)
450                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
451                 vm::DataMatch::DataMatch64(kvm_dm64) => self
452                     .fd
453                     .register_ioevent(fd, addr, kvm_dm64)
454                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
455             }
456         } else {
457             self.fd
458                 .register_ioevent(fd, addr, NoDatamatch)
459                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
460         }
461     }
462     ///
463     /// Unregisters an event from a certain address it has been previously registered to.
464     ///
465     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
466         let addr = &kvm_ioctls::IoEventAddress::from(*addr);
467         self.fd
468             .unregister_ioevent(fd, addr, NoDatamatch)
469             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
470     }
471 
472     ///
473     /// Constructs a routing entry
474     ///
475     fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
476         match &config {
477             InterruptSourceConfig::MsiIrq(cfg) => {
478                 let mut kvm_route = kvm_irq_routing_entry {
479                     gsi,
480                     type_: KVM_IRQ_ROUTING_MSI,
481                     ..Default::default()
482                 };
483 
484                 kvm_route.u.msi.address_lo = cfg.low_addr;
485                 kvm_route.u.msi.address_hi = cfg.high_addr;
486                 kvm_route.u.msi.data = cfg.data;
487 
488                 if self.check_extension(crate::kvm::Cap::MsiDevid) {
489                     // On AArch64 there is a limitation on the range of the
490                     // 'devid': it must fit in 16 bits (at most 65535, the
491                     // maximum of a u16).
492                     //
493                     // The BDF cannot be used directly, because the 'segment'
494                     // occupies the high 16 bits. The layout of the u32 BDF is:
495                     // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
496                     // |      segment    |     bus    |   device   |  function  |
497                     //
498                     // Since we support only one bus per segment, we can build a
499                     // 'devid' by replacing the 'bus' bits with the low 8 bits of
500                     // the 'segment' data. This resolves the range-checking
501                     // problem and gives a distinct 'devid' to every device. The
502                     // limitation is that at most 256 segments can be supported.
503                     //
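                    // Worked example (hypothetical values): segment 0x0001,
                    // bus 0x00, device 0x03, function 0 gives the u32 BDF
                    // 0x0001_0018. The expression below then yields
                    //   (0x0001_0018 & 0x00ff_0000) >> 8 | (0x0001_0018 & 0xff)
                    //     = 0x0100 | 0x18 = 0x0118,
                    // i.e. the low byte of the segment lands in the bus byte.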
504                     let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;
505 
506                     kvm_route.flags = KVM_MSI_VALID_DEVID;
507                     kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
508                 }
509                 kvm_route.into()
510             }
511             InterruptSourceConfig::LegacyIrq(cfg) => {
512                 let mut kvm_route = kvm_irq_routing_entry {
513                     gsi,
514                     type_: KVM_IRQ_ROUTING_IRQCHIP,
515                     ..Default::default()
516                 };
517                 kvm_route.u.irqchip.irqchip = cfg.irqchip;
518                 kvm_route.u.irqchip.pin = cfg.pin;
519 
520                 kvm_route.into()
521             }
522         }
523     }
524 
525     ///
526     /// Sets the GSI routing table entries, overwriting any previously set
527     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
528     ///
529     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
530         let mut irq_routing =
531             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
532         irq_routing[0].nr = entries.len() as u32;
533         irq_routing[0].flags = 0;
534         let entries: Vec<kvm_irq_routing_entry> = entries
535             .iter()
536             .map(|entry| match entry {
537                 IrqRoutingEntry::Kvm(e) => *e,
538                 #[allow(unreachable_patterns)]
539                 _ => panic!("IrqRoutingEntry type is wrong"),
540             })
541             .collect();
542 
543         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
544         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
545         // everything from entries.
546         unsafe {
547             let entries_slice: &mut [kvm_irq_routing_entry] =
548                 irq_routing[0].entries.as_mut_slice(entries.len());
549             entries_slice.copy_from_slice(&entries);
550         }
551 
552         self.fd
553             .set_gsi_routing(&irq_routing[0])
554             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
555     }
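    // Usage sketch for the two methods above (`msi_cfg` is hypothetical, not
    // defined in this file): build an entry with `make_routing_entry`, then
    // install the whole table with `set_gsi_routing`:
    //
    //   let entry = vm.make_routing_entry(5, &InterruptSourceConfig::MsiIrq(msi_cfg));
    //   vm.set_gsi_routing(&[entry])?;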
556     ///
557     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
558     ///
559     fn make_user_memory_region(
560         &self,
561         slot: u32,
562         guest_phys_addr: u64,
563         memory_size: u64,
564         userspace_addr: u64,
565         readonly: bool,
566         log_dirty_pages: bool,
567     ) -> UserMemoryRegion {
568         kvm_userspace_memory_region {
569             slot,
570             guest_phys_addr,
571             memory_size,
572             userspace_addr,
573             flags: if readonly { KVM_MEM_READONLY } else { 0 }
574                 | if log_dirty_pages {
575                     KVM_MEM_LOG_DIRTY_PAGES
576                 } else {
577                     0
578                 },
579         }
580         .into()
581     }
582     ///
583     /// Creates a guest physical memory region.
584     ///
585     fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
586         let mut region: kvm_userspace_memory_region = user_memory_region.into();
587 
588         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
589             if (region.flags & KVM_MEM_READONLY) != 0 {
590                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
591                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
592                 )));
593             }
594 
595             // Keep track of the regions that need dirty pages log
596             self.dirty_log_slots.write().unwrap().insert(
597                 region.slot,
598                 KvmDirtyLogSlot {
599                     slot: region.slot,
600                     guest_phys_addr: region.guest_phys_addr,
601                     memory_size: region.memory_size,
602                     userspace_addr: region.userspace_addr,
603                 },
604             );
605 
606             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
607             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
608             region.flags = 0;
609         }
610 
611         // SAFETY: Safe because guest regions are guaranteed not to overlap.
612         unsafe {
613             self.fd
614                 .set_user_memory_region(region)
615                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
616         }
617     }
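    // Dirty-log lifecycle sketch for a region created with
    // `log_dirty_pages = true` (addresses are hypothetical): the region is
    // first registered without `KVM_MEM_LOG_DIRTY_PAGES`; the flag is only
    // toggled by `start_dirty_log`/`stop_dirty_log` below:
    //
    //   let r = vm.make_user_memory_region(0, 0x1_0000, 0x10_0000, host_addr, false, true);
    //   vm.create_user_memory_region(r)?;
    //   vm.start_dirty_log()?;
    //   let bitmap = vm.get_dirty_log(0, 0x1_0000, 0x10_0000)?;
    //   vm.stop_dirty_log()?;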
618     ///
619     /// Removes a guest physical memory region.
620     ///
621     fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
622         let mut region: kvm_userspace_memory_region = user_memory_region.into();
623 
624         // Remove the corresponding entry from "self.dirty_log_slots" if needed
625         self.dirty_log_slots.write().unwrap().remove(&region.slot);
626 
627         // Setting the size to 0 means "remove"
628         region.memory_size = 0;
629         // SAFETY: Safe because guest regions are guaranteed not to overlap.
630         unsafe {
631             self.fd
632                 .set_user_memory_region(region)
633                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
634         }
635     }
636     ///
637     /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
638     ///
639     #[cfg(target_arch = "aarch64")]
640     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
641         self.fd
642             .get_preferred_target(kvi)
643             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
644     }
645     #[cfg(target_arch = "x86_64")]
646     fn enable_split_irq(&self) -> vm::Result<()> {
647         // Create a split irqchip.
648         // Only the local APIC is emulated in the kernel; the two PICs and the
649         // IOAPIC are not.
650         let mut cap = kvm_enable_cap {
651             cap: KVM_CAP_SPLIT_IRQCHIP,
652             ..Default::default()
653         };
654         cap.args[0] = NUM_IOAPIC_PINS as u64;
655         self.fd
656             .enable_cap(&cap)
657             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
658         Ok(())
659     }
660     #[cfg(target_arch = "x86_64")]
661     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
662         let mut cap = kvm_enable_cap {
663             cap: KVM_CAP_SGX_ATTRIBUTE,
664             ..Default::default()
665         };
666         cap.args[0] = file.as_raw_fd() as u64;
667         self.fd
668             .enable_cap(&cap)
669             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
670         Ok(())
671     }
672     /// Retrieve guest clock.
673     #[cfg(target_arch = "x86_64")]
674     fn get_clock(&self) -> vm::Result<ClockData> {
675         Ok(self
676             .fd
677             .get_clock()
678             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
679             .into())
680     }
681     /// Set guest clock.
682     #[cfg(target_arch = "x86_64")]
683     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
684         let data = (*data).into();
685         self.fd
686             .set_clock(&data)
687             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
688     }
689     /// Create a device that is used for passthrough
690     fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
691         let mut vfio_dev = kvm_create_device {
692             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
693             fd: 0,
694             flags: 0,
695         };
696 
697         self.create_device(&mut vfio_dev)
698             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
699     }
700     ///
701     /// Start logging dirty pages
702     ///
703     fn start_dirty_log(&self) -> vm::Result<()> {
704         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
705         for (_, s) in dirty_log_slots.iter() {
706             let region = kvm_userspace_memory_region {
707                 slot: s.slot,
708                 guest_phys_addr: s.guest_phys_addr,
709                 memory_size: s.memory_size,
710                 userspace_addr: s.userspace_addr,
711                 flags: KVM_MEM_LOG_DIRTY_PAGES,
712             };
713             // SAFETY: Safe because guest regions are guaranteed not to overlap.
714             unsafe {
715                 self.fd
716                     .set_user_memory_region(region)
717                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
718             }
719         }
720 
721         Ok(())
722     }
723 
724     ///
725     /// Stop logging dirty pages
726     ///
727     fn stop_dirty_log(&self) -> vm::Result<()> {
728         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
729         for (_, s) in dirty_log_slots.iter() {
730             let region = kvm_userspace_memory_region {
731                 slot: s.slot,
732                 guest_phys_addr: s.guest_phys_addr,
733                 memory_size: s.memory_size,
734                 userspace_addr: s.userspace_addr,
735                 flags: 0,
736             };
737             // SAFETY: Safe because guest regions are guaranteed not to overlap.
738             unsafe {
739                 self.fd
740                     .set_user_memory_region(region)
741                     .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
742             }
743         }
744 
745         Ok(())
746     }
747 
748     ///
749     /// Get dirty pages bitmap (one bit per page)
750     ///
751     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
752         self.fd
753             .get_dirty_log(slot, memory_size as usize)
754             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
755     }
756 
757     ///
758     /// Initialize TDX for this VM
759     ///
760     #[cfg(feature = "tdx")]
761     fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
762         use std::io::{Error, ErrorKind};
763         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
764             cpuid.iter().map(|e| (*e).into()).collect();
765         let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
766             vm::HypervisorVmError::InitializeTdx(Error::new(
767                 ErrorKind::Other,
768                 "failed to allocate CpuId",
769             ))
770         })?;
771 
772         #[repr(C)]
773         struct TdxInitVm {
774             max_vcpus: u32,
775             tsc_khz: u32,
776             attributes: u64,
777             cpuid: u64,
778             mrconfigid: [u64; 6],
779             mrowner: [u64; 6],
780             mrownerconfig: [u64; 6],
781             reserved: [u64; 43],
782         }
783         let data = TdxInitVm {
784             max_vcpus,
785             tsc_khz: 0,
786             attributes: 0,
787             cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
788             mrconfigid: [0; 6],
789             mrowner: [0; 6],
790             mrownerconfig: [0; 6],
791             reserved: [0; 43],
792         };
793 
794         tdx_command(
795             &self.fd.as_raw_fd(),
796             TdxCommand::InitVm,
797             0,
798             &data as *const _ as u64,
799         )
800         .map_err(vm::HypervisorVmError::InitializeTdx)
801     }
802 
803     ///
804     /// Finalize the TDX setup for this VM
805     ///
806     #[cfg(feature = "tdx")]
807     fn tdx_finalize(&self) -> vm::Result<()> {
808         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
809             .map_err(vm::HypervisorVmError::FinalizeTdx)
810     }
811 
812     ///
813     /// Initialize memory regions for the TDX VM
814     ///
815     #[cfg(feature = "tdx")]
816     fn tdx_init_memory_region(
817         &self,
818         host_address: u64,
819         guest_address: u64,
820         size: u64,
821         measure: bool,
822     ) -> vm::Result<()> {
823         #[repr(C)]
824         struct TdxInitMemRegion {
825             host_address: u64,
826             guest_address: u64,
827             pages: u64,
828         }
829         let data = TdxInitMemRegion {
830             host_address,
831             guest_address,
832             pages: size / 4096,
833         };
834 
835         tdx_command(
836             &self.fd.as_raw_fd(),
837             TdxCommand::InitMemRegion,
838             if measure { 1 } else { 0 },
839             &data as *const _ as u64,
840         )
841         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
842     }
843     /// Downcast to the underlying KvmVm type
844     fn as_any(&self) -> &dyn Any {
845         self
846     }
847 }
848 
849 #[cfg(feature = "tdx")]
850 fn tdx_command(
851     fd: &RawFd,
852     command: TdxCommand,
853     metadata: u32,
854     data: u64,
855 ) -> std::result::Result<(), std::io::Error> {
856     #[repr(C)]
857     struct TdxIoctlCmd {
858         command: TdxCommand,
859         metadata: u32,
860         data: u64,
861     }
862     let cmd = TdxIoctlCmd {
863         command,
864         metadata,
865         data,
866     };
867     // SAFETY: FFI call. All input parameters are valid.
868     let ret = unsafe {
869         ioctl_with_val(
870             fd,
871             KVM_MEMORY_ENCRYPT_OP(),
872             &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
873         )
874     };
875 
876     if ret < 0 {
877         return Err(std::io::Error::last_os_error());
878     }
879     Ok(())
880 }
881 
882 /// Wrapper over KVM system ioctls.
883 pub struct KvmHypervisor {
884     kvm: Kvm,
885 }
886 
887 impl KvmHypervisor {
888     #[cfg(target_arch = "x86_64")]
889     ///
890     /// Retrieve the list of MSRs supported by the hypervisor.
891     ///
892     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
893         self.kvm
894             .get_msr_index_list()
895             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
896     }
897 }
898 
899 /// Enum for KVM related error
900 #[derive(Debug, Error)]
901 pub enum KvmError {
902     #[error("Capability missing: {0:?}")]
903     CapabilityMissing(Cap),
904 }
905 pub type KvmResult<T> = result::Result<T, KvmError>;
906 impl KvmHypervisor {
907     /// Create a hypervisor based on Kvm
908     #[allow(clippy::new_ret_no_self)]
909     pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
910         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
911         let api_version = kvm_obj.get_api_version();
912 
913         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
914             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
915         }
916 
917         Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
918     }
919     /// Check if the hypervisor is available
920     pub fn is_available() -> hypervisor::Result<bool> {
921         match std::fs::metadata("/dev/kvm") {
922             Ok(_) => Ok(true),
923             Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
924             Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
925                 err.into(),
926             )),
927         }
928     }
929 }
930 /// Implementation of Hypervisor trait for KVM
931 /// Example:
932 /// #[cfg(feature = "kvm")]
933 /// extern crate hypervisor;
934 /// // `KvmHypervisor::new()` already returns an `Arc<dyn hypervisor::Hypervisor>`.
935 /// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
936 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
937 ///
938 impl hypervisor::Hypervisor for KvmHypervisor {
939     ///
940     /// Returns the type of the hypervisor
941     ///
942     fn hypervisor_type(&self) -> HypervisorType {
943         HypervisorType::Kvm
944     }
945     /// Create a KVM VM object of a specific VM type and return it as a `Vm` trait object
946     /// Example
947     /// # extern crate hypervisor;
948     /// # use hypervisor::KvmHypervisor;
949     /// use hypervisor::KvmVm;
950     /// let hypervisor = KvmHypervisor::new().unwrap();
951     /// let vm = hypervisor.create_vm_with_type(0 /* default platform type */).unwrap();
952     ///
953     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
954         let fd: VmFd;
955         loop {
956             match self.kvm.create_vm_with_type(vm_type) {
957                 Ok(res) => fd = res,
958                 Err(e) => {
959                     if e.errno() == libc::EINTR {
960                         // If the error returned is EINTR, the ioctl was
961                         // interrupted and we have to retry, as this can't
962                         // be considered a regular error.
963                         continue;
964                     } else {
965                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
966                     }
967                 }
968             }
969             break;
970         }
971 
972         let vm_fd = Arc::new(fd);
973 
974         #[cfg(target_arch = "x86_64")]
975         {
976             let msr_list = self.get_msr_list()?;
977             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
978             let mut msrs: Vec<MsrEntry> = vec![
979                 MsrEntry {
980                     ..Default::default()
981                 };
982                 num_msrs
983             ];
984             let indices = msr_list.as_slice();
985             for (pos, index) in indices.iter().enumerate() {
986                 msrs[pos].index = *index;
987             }
988 
989             Ok(Arc::new(KvmVm {
990                 fd: vm_fd,
991                 msrs,
992                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
993             }))
994         }
995 
996         #[cfg(target_arch = "aarch64")]
997         {
998             Ok(Arc::new(KvmVm {
999                 fd: vm_fd,
1000                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
1001             }))
1002         }
1003     }
1004 
1005     /// Create a KVM VM object and return it as a `Vm` trait object
1006     /// Example
1007     /// # extern crate hypervisor;
1008     /// # use hypervisor::KvmHypervisor;
1009     /// use hypervisor::KvmVm;
1010     /// let hypervisor = KvmHypervisor::new().unwrap();
1011     /// let vm = hypervisor.create_vm().unwrap();
1012     ///
1013     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
1014         #[allow(unused_mut)]
1015         let mut vm_type: u64 = 0; // Create with default platform type
1016 
1017         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
1018         // size from the host and use that when creating the VM, which may
1019         // avoid unnecessary VM creation failures.
1020         #[cfg(target_arch = "aarch64")]
1021         if self.kvm.check_extension(Cap::ArmVmIPASize) {
1022             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
1023         }
1024 
1025         self.create_vm_with_type(vm_type)
1026     }
1027 
1028     fn check_required_extensions(&self) -> hypervisor::Result<()> {
1029         check_required_kvm_extensions(&self.kvm)
1030             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
1031     }
1032 
1033     #[cfg(target_arch = "x86_64")]
1034     ///
1035     /// X86 specific call to get the system supported CPUID values.
1036     ///
1037     fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
1038         let kvm_cpuid = self
1039             .kvm
1040             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
1041             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;
1042 
1043         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1044 
1045         Ok(v)
1046     }
1047 
1048     #[cfg(target_arch = "aarch64")]
1049     ///
1050     /// Retrieve AArch64 host maximum IPA size supported by KVM.
1051     ///
1052     fn get_host_ipa_limit(&self) -> i32 {
1053         self.kvm.get_host_ipa_limit()
1054     }
1055 
1056     ///
1057     /// Retrieve TDX capabilities
1058     ///
1059     #[cfg(feature = "tdx")]
1060     fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
1061         let data = TdxCapabilities {
1062             nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
1063             ..Default::default()
1064         };
1065 
1066         tdx_command(
1067             &self.kvm.as_raw_fd(),
1068             TdxCommand::Capabilities,
1069             0,
1070             &data as *const _ as u64,
1071         )
1072         .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;
1073 
1074         Ok(data)
1075     }
1076 }
1077 /// Vcpu struct for KVM
1078 pub struct KvmVcpu {
1079     fd: VcpuFd,
1080     #[cfg(target_arch = "x86_64")]
1081     msrs: Vec<MsrEntry>,
1082     vm_ops: Option<Arc<dyn vm::VmOps>>,
1083     #[cfg(target_arch = "x86_64")]
1084     hyperv_synic: AtomicBool,
1085 }
1086 /// Implementation of Vcpu trait for KVM
1087 /// Example:
1088 /// #[cfg(feature = "kvm")]
1089 /// extern crate hypervisor;
1090 /// // `KvmHypervisor::new()` already returns an `Arc<dyn hypervisor::Hypervisor>`.
1091 /// let hypervisor = hypervisor::kvm::KvmHypervisor::new().unwrap();
1092 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
1093 /// let vcpu = vm.create_vcpu(0, None).unwrap();
1094 /// // ... then call the `Vcpu` trait's getters/setters, e.g. `vcpu.get_regs().unwrap();`
1095 ///
1096 impl cpu::Vcpu for KvmVcpu {
1097     #[cfg(target_arch = "x86_64")]
1098     ///
1099     /// Returns the vCPU general purpose registers.
1100     ///
1101     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1102         Ok(self
1103             .fd
1104             .get_regs()
1105             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1106             .into())
1107     }
1108     ///
1109     /// Returns the vCPU general purpose registers.
1110     /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
1111     /// is used to get registers one by one.
1112     ///
1113     #[cfg(target_arch = "aarch64")]
1114     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1115         let mut state: StandardRegisters = kvm_regs::default();
1116         let mut off = offset__of!(user_pt_regs, regs);
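        // `arm64_core_reg_id!` (defined in this crate's aarch64 module)
        // composes a register id from KVM_REG_ARM64, the size flag (e.g.
        // KVM_REG_SIZE_U64), KVM_REG_ARM_CORE and the offset expressed in
        // 32-bit words; `off` is therefore a byte offset into `kvm_regs`.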
1117         // There are 31 user_pt_regs:
1118         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1119         // These are the general-purpose registers of the Armv8-A architecture
1120         // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
1121         for i in 0..31 {
1122             state.regs.regs[i] = self
1123                 .fd
1124                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1125                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1126             off += std::mem::size_of::<u64>();
1127         }
1128 
1129         // We are now entering the "Other register" section of the Armv8-A architecture.
1130         // First one, stack pointer.
1131         let off = offset__of!(user_pt_regs, sp);
1132         state.regs.sp = self
1133             .fd
1134             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1135             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1136 
1137         // Second one, the program counter.
1138         let off = offset__of!(user_pt_regs, pc);
1139         state.regs.pc = self
1140             .fd
1141             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1142             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1143 
1144         // Next is the processor state.
1145         let off = offset__of!(user_pt_regs, pstate);
1146         state.regs.pstate = self
1147             .fd
1148             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1149             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1150 
1151         // The stack pointer associated with EL1
1152         let off = offset__of!(kvm_regs, sp_el1);
1153         state.sp_el1 = self
1154             .fd
1155             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1156             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1157 
1158         // Exception Link Register for EL1, when taking an exception to EL1, this register
1159         // holds the address to which to return afterwards.
1160         let off = offset__of!(kvm_regs, elr_el1);
1161         state.elr_el1 = self
1162             .fd
1163             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1164             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1165 
1166         // Saved Program Status Registers, there are 5 of them used in the kernel.
1167         let mut off = offset__of!(kvm_regs, spsr);
1168         for i in 0..KVM_NR_SPSR as usize {
1169             state.spsr[i] = self
1170                 .fd
1171                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1172                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1173             off += std::mem::size_of::<u64>();
1174         }
1175 
1176         // Now moving on to the floating-point registers, which are stored in the kernel's user_fpsimd_state:
1177         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1178         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1179         for i in 0..32 {
1180             state.fp_regs.vregs[i] = self
1181                 .fd
1182                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1183                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1184                 .into();
1185             off += mem::size_of::<u128>();
1186         }
1187 
1188         // Floating-point Status Register
1189         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1190         state.fp_regs.fpsr = self
1191             .fd
1192             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1193             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1194             as u32;
1195 
1196         // Floating-point Control Register
1197         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1198         state.fp_regs.fpcr = self
1199             .fd
1200             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1201             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1202             as u32;
1203         Ok(state)
1204     }
1205     #[cfg(target_arch = "x86_64")]
1206     ///
1207     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1208     ///
1209     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1210         let regs = (*regs).into();
1211         self.fd
1212             .set_regs(&regs)
1213             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1214     }
1215 
1216     ///
1217     /// Sets the vCPU general purpose registers.
1218     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1219     /// is used to set registers one by one.
1220     ///
1221     #[cfg(target_arch = "aarch64")]
1222     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1223         // The function sets the registers in the exact same order as `get_regs()`
1224         // reads them. Look there for additional info on the individual registers.
1225         let mut off = offset__of!(user_pt_regs, regs);
1226         for i in 0..31 {
1227             self.fd
1228                 .set_one_reg(
1229                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1230                     state.regs.regs[i],
1231                 )
1232                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1233             off += std::mem::size_of::<u64>();
1234         }
1235 
1236         let off = offset__of!(user_pt_regs, sp);
1237         self.fd
1238             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1239             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1240 
1241         let off = offset__of!(user_pt_regs, pc);
1242         self.fd
1243             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1244             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1245 
1246         let off = offset__of!(user_pt_regs, pstate);
1247         self.fd
1248             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1249             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1250 
1251         let off = offset__of!(kvm_regs, sp_el1);
1252         self.fd
1253             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1254             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1255 
1256         let off = offset__of!(kvm_regs, elr_el1);
1257         self.fd
1258             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1259             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1260 
1261         let mut off = offset__of!(kvm_regs, spsr);
1262         for i in 0..KVM_NR_SPSR as usize {
1263             self.fd
1264                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1265                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1266             off += std::mem::size_of::<u64>();
1267         }
1268 
1269         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1270         for i in 0..32 {
1271             self.fd
1272                 .set_one_reg(
1273                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1274                     state.fp_regs.vregs[i] as u64,
1275                 )
1276                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1277             off += mem::size_of::<u128>();
1278         }
1279 
1280         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1281         self.fd
1282             .set_one_reg(
1283                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1284                 state.fp_regs.fpsr as u64,
1285             )
1286             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1287 
1288         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1289         self.fd
1290             .set_one_reg(
1291                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1292                 state.fp_regs.fpcr as u64,
1293             )
1294             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1295         Ok(())
1296     }
1297 
1298     #[cfg(target_arch = "x86_64")]
1299     ///
1300     /// Returns the vCPU special registers.
1301     ///
1302     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1303         Ok(self
1304             .fd
1305             .get_sregs()
1306             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1307             .into())
1308     }
1309     #[cfg(target_arch = "x86_64")]
1310     ///
1311     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1312     ///
1313     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1314         let sregs = (*sregs).into();
1315         self.fd
1316             .set_sregs(&sregs)
1317             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1318     }
1319     #[cfg(target_arch = "x86_64")]
1320     ///
1321     /// Returns the floating point state (FPU) from the vCPU.
1322     ///
1323     fn get_fpu(&self) -> cpu::Result<FpuState> {
1324         Ok(self
1325             .fd
1326             .get_fpu()
1327             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1328             .into())
1329     }
1330     #[cfg(target_arch = "x86_64")]
1331     ///
1332     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1333     ///
1334     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1335         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1336         self.fd
1337             .set_fpu(&fpu)
1338             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1339     }
1340     #[cfg(target_arch = "x86_64")]
1341     ///
1342     /// X86 specific call to set up the CPUID registers.
1343     ///
1344     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1345         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1346             cpuid.iter().map(|e| (*e).into()).collect();
1347         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1348             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1349 
1350         self.fd
1351             .set_cpuid2(&kvm_cpuid)
1352             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1353     }
1354     #[cfg(target_arch = "x86_64")]
1355     ///
1356     /// X86 specific call to enable HyperV SynIC
1357     ///
1358     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1359         // Update the information about Hyper-V SynIC being enabled and
1360         // emulated, as it will later influence which MSRs need to be saved.
1361         self.hyperv_synic.store(true, Ordering::Release);
1362 
1363         let cap = kvm_enable_cap {
1364             cap: KVM_CAP_HYPERV_SYNIC,
1365             ..Default::default()
1366         };
1367         self.fd
1368             .enable_cap(&cap)
1369             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1370     }
1371     ///
1372     /// X86 specific call to retrieve the CPUID registers.
1373     ///
1374     #[cfg(target_arch = "x86_64")]
1375     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1376         let kvm_cpuid = self
1377             .fd
1378             .get_cpuid2(num_entries)
1379             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1380 
1381         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1382 
1383         Ok(v)
1384     }
1385     #[cfg(target_arch = "x86_64")]
1386     ///
1387     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1388     ///
1389     fn get_lapic(&self) -> cpu::Result<LapicState> {
1390         Ok(self
1391             .fd
1392             .get_lapic()
1393             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1394             .into())
1395     }
1396     #[cfg(target_arch = "x86_64")]
1397     ///
1398     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1399     ///
1400     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1401         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1402         self.fd
1403             .set_lapic(&klapic)
1404             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1405     }
1406     #[cfg(target_arch = "x86_64")]
1407     ///
1408     /// Returns the model-specific registers (MSR) for this vCPU.
1409     ///
1410     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1411         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1412         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1413         let succ = self
1414             .fd
1415             .get_msrs(&mut kvm_msrs)
1416             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1417 
1418         msrs[..succ].copy_from_slice(
1419             &kvm_msrs.as_slice()[..succ]
1420                 .iter()
1421                 .map(|e| (*e).into())
1422                 .collect::<Vec<MsrEntry>>(),
1423         );
1424 
1425         Ok(succ)
1426     }
1427     #[cfg(target_arch = "x86_64")]
1428     ///
1429     /// Set up the model-specific registers (MSR) for this vCPU.
1430     /// Returns the number of MSR entries actually written.
1431     ///
1432     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1433         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1434         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1435         self.fd
1436             .set_msrs(&kvm_msrs)
1437             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1438     }
1439     ///
1440     /// Returns the vcpu's current "multiprocessing state".
1441     ///
1442     fn get_mp_state(&self) -> cpu::Result<MpState> {
1443         Ok(self
1444             .fd
1445             .get_mp_state()
1446             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1447             .into())
1448     }
1449     ///
1450     /// Sets the vcpu's current "multiprocessing state".
1451     ///
1452     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1453         self.fd
1454             .set_mp_state(mp_state.into())
1455             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1456     }
1457     #[cfg(target_arch = "x86_64")]
1458     ///
1459     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1460     ///
1461     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1462         let tr = self
1463             .fd
1464             .translate_gva(gva)
1465             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1466         // tr.valid is set if the GVA is mapped to a valid GPA.
1467         match tr.valid {
1468             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1469                 "Invalid GVA: {:#x}",
1470                 gva
1471             ))),
1472             _ => Ok((tr.physical_address, 0)),
1473         }
1474     }
1475     ///
1476     /// Triggers the running of the current virtual CPU returning an exit reason.
1477     ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents
    /// potential soft lockups when it is resumed.
    ///
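    /// # Example
    ///
    /// A minimal sketch; on a vCPU whose PV clock is not yet set up, this is
    /// expected to be a no-op (the -EINVAL case handled below).
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// # let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// # let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// # let vm = hv.create_vm().expect("new VM fd creation failed");
    /// # vm.enable_split_irq().unwrap();
    /// # let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.notify_guest_clock_paused().unwrap();
    /// ```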
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet
            // initialised, which could be because we're still in firmware or
            // because the guest doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to install hardware breakpoints and/or enable single stepping.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 hardware breakpoints are supported but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // Bit 9: GE (global exact breakpoint enable) flag.
        // Bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag for this breakpoint.
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }
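        // For example, with two breakpoint addresses this sets G0 (bit 1)
        // and G1 (bit 3), giving DR7 = 0x0600 | 0x2 | 0x8 = 0x060a.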

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets the list of guest registers that are supported by the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For ARMv8 there are around 500 of them.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list contains both core registers and system
        // registers. The list holds the number of registers and their IDs,
        // and we need to call KVM_GET_ONE_REG on each ID to save them. We
        // carve out of the list the core registers, which are represented in
        // the kernel by the kvm_regs structure and whose IDs we can calculate
        // from their offsets in that structure, keeping only the system
        // registers.
        reg_list.retain(|regid| is_system_register(*regid));

        // For each register left in the fetched list, simply call
        // KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
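        // i.e. 0x3c5: EL1h mode with the D, A, I and F exception bits masked.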

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Set the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as if
    /// it might be affected by the internal state modifications of the other
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method, getting MSRs in
        // chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
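        // For example, if the list holds entries [A, B, C, D] and GET_MSRS
        // only returns 2, then C is the faulty entry: we keep [A, B], skip C,
        // and retry starting from D.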
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: the `CpuState` to restore.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS is different from the
        // expected amount, we fall back to a slower method, setting MSRs in
        // chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
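        // The skipping logic mirrors the chunked GET_MSRS fallback in state()
        // above: the entry right after the reported count is the faulty one,
        // and we resume setting from the entry that follows it.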
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for the TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
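        // Per the KVM vCPU device API, the PMU overflow interrupt
        // (KVM_ARM_VCPU_PMU_V3_IRQ) must be set before the PMU is initialized
        // with KVM_ARM_VCPU_PMU_V3_INIT, hence the ordering below.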
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as
    /// related states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}