xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision cb6a14dec94b9bc892f9e010e46efac1c11e316c)
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

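// A minimal sketch of the flag mapping implemented by the two conversions
// above (illustrative only; the test module name is ours, everything else
// comes from this file):
#[cfg(test)]
mod user_memory_region_flags_tests {
    use super::*;

    #[test]
    fn readonly_kvm_region_loses_write_permission() {
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x1000,
            userspace_addr: 0,
            flags: KVM_MEM_READONLY,
        };
        let region: UserMemoryRegion = kvm_region.into();
        // KVM memory is always readable; KVM_MEM_READONLY only removes write.
        assert_ne!(region.flags & USER_MEMORY_REGION_READ, 0);
        assert_eq!(region.flags & USER_MEMORY_REGION_WRITE, 0);

        // Converting back restores the original KVM flags.
        let back: kvm_userspace_memory_region = region.into();
        assert_eq!(back.flags, KVM_MEM_READONLY);
    }
}
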
impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

///
/// Implementation of the Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// Vm setter/getter methods, e.g. vm.set_clock()/vm.get_clock(), can then be called.
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 the range of the 'devid' is limited: it
                    // cannot exceed 65535 (the maximum of u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This resolves the range checking problem and gives a
                    // distinct 'devid' to every device. The limitation is that
                    // at most 256 segments can be supported.
                    //
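                    // Worked example (illustrative values): segment 0x0001,
                    // bus 0x00, device 0x01, function 0 encode to the BDF
                    // 0x0001_0008, which the expression below folds into the
                    // 16-bit devid 0x0108.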
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
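    // Usage sketch for the two methods above (illustrative; `gsi` and
    // `config` are assumed to come from the caller's interrupt manager):
    //
    //     let entry = vm.make_routing_entry(gsi, &config);
    //     vm.set_gsi_routing(&[entry])?;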
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
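    // Usage sketch (illustrative; the addresses are made up): build a region
    // descriptor and hand it to `create_user_memory_region` below:
    //
    //     let region = vm.make_user_memory_region(0, 0x10_0000, 0x1000, hva, false, true);
    //     vm.create_user_memory_region(region)?;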
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }
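
    // Dirty-page tracking lifecycle sketch (illustrative; `slot`, `base_gpa`
    // and `size` stand in for a tracked region):
    //
    //     vm.start_dirty_log()?;                    // sets KVM_MEM_LOG_DIRTY_PAGES
    //     let bitmap = vm.get_dirty_log(slot, base_gpa, size)?; // one bit per page
    //     vm.stop_dirty_log()?;                     // clears the flag again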

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of the Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0 /* default platform type */).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }
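        // On aarch64 the KVM machine type encodes the requested IPA size in
        // its low bits, so e.g. a host IPA limit of 40 bits yields a vm_type
        // of 40 (illustrative value).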

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// Vcpu getter/setter methods, e.g. vcpu.get_regs()/vcpu.set_regs(), can then be called.
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the ARMv8-A
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-A architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1, when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }
        // Now moving on to floating-point registers, which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact identical order from `state`. Look there
        // for some additional info on registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or because the guest
            // doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the debug registers to install hardware breakpoints and/or enable single-stepping.
    ///
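    /// # Example
    ///
    /// A minimal sketch, assuming the host KVM supports guest debug; the
    /// breakpoint address is hypothetical.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # extern crate vm_memory;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// // One hardware breakpoint, with single-stepping enabled.
    /// vcpu.set_guest_debug(&[vm_memory::GuestAddress(0x10_0000)], true)
    ///     .unwrap();
    /// ```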
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 hardware breakpoints are supported but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10 of DR7:
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: reserved, always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag for breakpoint i
            // (DR7 bit 2*i + 1).
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Configures the vCPU (KVM_ARM_VCPU_INIT ioctl) with the given target and features.
    ///
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For ARMv8 there are around 500 registers.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list contains both the core and the system registers.
        // It holds the number of registers and their ids; we need to call
        // KVM_GET_ONE_REG on each id in order to save all of them. We first drop
        // the core registers, which are represented in the kernel by the kvm_regs
        // structure and whose ids we can calculate from their offsets within it.
        reg_list.retain(|regid| is_system_register(*regid));

        // For each register left in the list, simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
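    /// The returned value encodes the vCPU's affinity levels (Aff0..Aff3),
    /// which the VMM uses when describing the CPU topology to the guest.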
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
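        // That is: enter the guest at EL1 (using SP_EL1), with Debug, SError,
        // IRQ and FIQ exceptions masked.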

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Set the PC (Program Counter) to the boot address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set: the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We choose to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state.
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost anything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. It looks as if it
    /// could be affected by the internal state modifications performed by
    /// the other GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS differs from the
        // expected amount, we fall back to a slower method, getting the MSRs
        // in chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some of them are not supported.
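        // Hypothetical illustration: with 10 MSRs in the list and entry 3
        // unsupported, GET_MSRS returns 3. We keep entries 0..3, skip the
        // faulty one, and retry from entry 4, repeating until the end of
        // the list is reached.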
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
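            // A reading of the raw indices below: the time-reference MSRs,
            // the SynIC control/message/event page MSRs, the sixteen SINTx
            // MSRs, and the synthetic timer MSRs.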
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state.
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state.
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS differs from the expected
        // amount, we fall back to a slower method, setting the MSRs in
        // chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some of them are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU.
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" flag in the shared kvm_run structure. While
    /// set, KVM_RUN returns to userspace immediately (with EINTR) instead of
    /// entering the guest.
    ///
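    /// A sketch of the intended use, e.g. when pausing a vCPU:
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// #[cfg(target_arch = "x86_64")]
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_immediate_exit(true);
    /// // The next KVM_RUN comes straight back with EINTR, which run()
    /// // surfaces as VmExit::Ignore (see the error handling in run()).
    /// let _ = vcpu.run();
    /// vcpu.set_immediate_exit(false);
    /// ```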
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about TDX exit reason.
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        // SAFETY: accessing a union field of the kvm_run structure; the
        // caller only invokes this on a TDX exit, where the tdx variant
        // is the active one.
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for TDX exit.
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        // SAFETY: see get_tdx_exit_details() above.
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU.
    ///
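    /// A sketch of the intended use: write the boot defaults into a freshly
    /// created vCPU.
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let entries = vcpu.boot_msr_entries();
    /// // set_msrs() reports how many entries were actually written.
    /// let written = vcpu.set_msrs(&entries).unwrap();
    /// assert!(written <= entries.len());
    /// ```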
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Check whether the KVM in-kernel PMUv3 emulation is available for this
    /// vCPU by probing the KVM_ARM_VCPU_PMU_V3_INIT device attribute.
    ///
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Initialize the in-kernel PMUv3 for this vCPU: first wire up the PMU
    /// overflow interrupt (KVM_ARM_VCPU_PMU_V3_IRQ), then issue
    /// KVM_ARM_VCPU_PMU_V3_INIT.
    ///
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        // The IRQ must be set before the PMU is initialized.
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}