xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision b440cb7d2330770cd415b63544a371d4caa2db3a)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 use crate::aarch64::gic::KvmGicV3Its;
13 #[cfg(target_arch = "aarch64")]
14 pub use crate::aarch64::{
15     check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
16     VcpuKvmState, MPIDR_EL1,
17 };
18 #[cfg(target_arch = "aarch64")]
19 use crate::arch::aarch64::gic::Vgic;
20 use crate::cpu;
21 use crate::device;
22 use crate::hypervisor;
23 use crate::vec_with_array_field;
24 use crate::vm::{self, InterruptSourceConfig, VmOps};
25 #[cfg(target_arch = "aarch64")]
26 use crate::{arm64_core_reg_id, offset__of};
27 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
28 use std::any::Any;
29 use std::collections::HashMap;
30 #[cfg(target_arch = "aarch64")]
31 use std::convert::TryInto;
32 #[cfg(target_arch = "x86_64")]
33 use std::fs::File;
34 #[cfg(target_arch = "x86_64")]
35 use std::os::unix::io::AsRawFd;
36 #[cfg(feature = "tdx")]
37 use std::os::unix::io::RawFd;
38 use std::result;
39 #[cfg(target_arch = "x86_64")]
40 use std::sync::atomic::{AtomicBool, Ordering};
41 #[cfg(target_arch = "aarch64")]
42 use std::sync::Mutex;
43 use std::sync::{Arc, RwLock};
44 use vmm_sys_util::eventfd::EventFd;
45 // x86_64 dependencies
46 #[cfg(target_arch = "x86_64")]
47 pub mod x86_64;
48 #[cfg(target_arch = "x86_64")]
49 use crate::arch::x86::{
50     CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
51     NUM_IOAPIC_PINS,
52 };
53 #[cfg(target_arch = "x86_64")]
54 use crate::ClockData;
55 use crate::{
56     CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
57     USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
58 };
59 #[cfg(target_arch = "aarch64")]
60 use aarch64::{RegList, Register, StandardRegisters};
61 #[cfg(target_arch = "x86_64")]
62 use kvm_bindings::{
63     kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
64     KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
65 };
66 #[cfg(target_arch = "x86_64")]
67 use x86_64::check_required_kvm_extensions;
68 #[cfg(target_arch = "x86_64")]
69 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
70 // aarch64 dependencies
71 #[cfg(target_arch = "aarch64")]
72 pub mod aarch64;
73 pub use kvm_bindings;
74 #[cfg(feature = "tdx")]
75 use kvm_bindings::KVMIO;
76 pub use kvm_bindings::{
77     kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
78     kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
79     KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
80 };
81 #[cfg(target_arch = "aarch64")]
82 use kvm_bindings::{
83     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
84     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
85 };
86 pub use kvm_ioctls;
87 pub use kvm_ioctls::{Cap, Kvm};
88 #[cfg(target_arch = "aarch64")]
89 use std::mem;
90 use thiserror::Error;
91 #[cfg(feature = "tdx")]
92 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
93 ///
94 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
95 ///
96 pub use {
97     kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
98     kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd,
99     kvm_ioctls::VcpuExit,
100 };
101 
#[cfg(target_arch = "x86_64")]
// Numeric id of the KVM_CAP_SGX_ATTRIBUTE capability, used by
// `KvmVm::enable_sgx_attribute` below.
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
// KVM exit reason number used for TDX-specific VM exits.
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
// TDG.VP.VMCALL sub-function: request a TD quote.
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
// TDG.VP.VMCALL sub-function: set up the event-notify interrupt.
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
// TDG.VP.VMCALL completion status: success.
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
// TDG.VP.VMCALL completion status: invalid operand.
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
// Declares `KVM_MEMORY_ENCRYPT_OP()` as an _IOWR ioctl number (type KVMIO,
// nr 0xba) taking a c_ulong payload; issued by `tdx_command` below.
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
118 
#[cfg(feature = "tdx")]
#[repr(u32)]
/// Command identifiers sent through the `KVM_MEMORY_ENCRYPT_OP` ioctl
/// (carried in the `command` field of the payload built by `tdx_command`).
enum TdxCommand {
    /// Query TDX capabilities.
    Capabilities = 0,
    /// Initialize the TD VM.
    InitVm,
    /// Initialize a TD vCPU.
    InitVcpu,
    /// Add (and optionally measure) a guest memory region.
    InitMemRegion,
    /// Finalize the TD setup.
    Finalize,
}
128 
#[cfg(feature = "tdx")]
/// Reason for a TDG.VP.VMCALL TDX exit, surfaced to the caller.
pub enum TdxExitDetails {
    /// Corresponds to `TDG_VP_VMCALL_GET_QUOTE`.
    GetQuote,
    /// Corresponds to `TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT`.
    SetupEventNotifyInterrupt,
}
134 
#[cfg(feature = "tdx")]
/// Status with which the VMM completes a TDX exit.
pub enum TdxExitStatus {
    /// Corresponds to `TDG_VP_VMCALL_SUCCESS`.
    Success,
    /// Corresponds to `TDG_VP_VMCALL_INVALID_OPERAND`.
    InvalidOperand,
}
140 
#[cfg(feature = "tdx")]
// Fixed capacity of the `cpuid_configs` array in `TdxCapabilities`.
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;
143 
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
/// One CPUID leaf/sub-leaf configuration entry reported in `TdxCapabilities`
/// (C layout, exchanged with the kernel).
pub struct TdxCpuidConfig {
    /// CPUID leaf (EAX input).
    pub leaf: u32,
    /// CPUID sub-leaf (ECX input).
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}
155 
#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
/// TDX capability block (C layout, exchanged with the kernel via the
/// `Capabilities` command).
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    /// Number of valid entries in `cpuid_configs`.
    pub nr_cpuid_configs: u32,
    /// Explicit padding matching the kernel's struct layout.
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}
168 
169 impl From<kvm_userspace_memory_region> for UserMemoryRegion {
170     fn from(region: kvm_userspace_memory_region) -> Self {
171         let mut flags = USER_MEMORY_REGION_READ;
172         if region.flags & KVM_MEM_READONLY == 0 {
173             flags |= USER_MEMORY_REGION_WRITE;
174         }
175         if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
176             flags |= USER_MEMORY_REGION_LOG_DIRTY;
177         }
178 
179         UserMemoryRegion {
180             slot: region.slot,
181             guest_phys_addr: region.guest_phys_addr,
182             memory_size: region.memory_size,
183             userspace_addr: region.userspace_addr,
184             flags,
185         }
186     }
187 }
188 
189 impl From<UserMemoryRegion> for kvm_userspace_memory_region {
190     fn from(region: UserMemoryRegion) -> Self {
191         assert!(
192             region.flags & USER_MEMORY_REGION_READ != 0,
193             "KVM mapped memory is always readable"
194         );
195 
196         let mut flags = 0;
197         if region.flags & USER_MEMORY_REGION_WRITE == 0 {
198             flags |= KVM_MEM_READONLY;
199         }
200         if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
201             flags |= KVM_MEM_LOG_DIRTY_PAGES;
202         }
203 
204         kvm_userspace_memory_region {
205             slot: region.slot,
206             guest_phys_addr: region.guest_phys_addr,
207             memory_size: region.memory_size,
208             userspace_addr: region.userspace_addr,
209             flags,
210         }
211     }
212 }
213 
impl From<kvm_mp_state> for MpState {
    /// Wrap KVM's raw multi-processing state in the hypervisor-agnostic enum.
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}
219 
220 impl From<MpState> for kvm_mp_state {
221     fn from(ms: MpState) -> Self {
222         match ms {
223             MpState::Kvm(s) => s,
224             /* Needed in case other hypervisors are enabled */
225             #[allow(unreachable_patterns)]
226             _ => panic!("CpuState is not valid"),
227         }
228     }
229 }
230 
impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    /// Convert kvm-ioctls' ioeventfd address type into the crate-local mirror enum.
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            // Port I/O address.
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            // Memory-mapped I/O address.
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}
239 
impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    /// Convert the crate-local ioeventfd address into kvm-ioctls' equivalent.
    fn from(a: IoEventAddress) -> Self {
        match a {
            // Port I/O address.
            IoEventAddress::Pio(x) => Self::Pio(x),
            // Memory-mapped I/O address.
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}
248 
impl From<VcpuKvmState> for CpuState {
    /// Wrap the KVM-specific vCPU state in the hypervisor-agnostic enum.
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}
254 
impl From<CpuState> for VcpuKvmState {
    /// Extract the KVM-specific vCPU state.
    ///
    /// # Panics
    /// Panics if `s` holds state produced by a different hypervisor.
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}
265 
#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    /// Wrap KVM's raw clock data in the hypervisor-agnostic enum.
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}
272 
273 #[cfg(target_arch = "x86_64")]
274 impl From<ClockData> for kvm_clock_data {
275     fn from(ms: ClockData) -> Self {
276         match ms {
277             ClockData::Kvm(s) => s,
278             /* Needed in case other hypervisors are enabled */
279             #[allow(unreachable_patterns)]
280             _ => panic!("CpuState is not valid"),
281         }
282     }
283 }
284 
impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    /// Wrap a raw KVM interrupt-routing entry in the hypervisor-agnostic enum.
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}
290 
impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    /// Extract the raw KVM interrupt-routing entry.
    ///
    /// # Panics
    /// Panics if `e` was produced by a different hypervisor.
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}
301 
/// Saved geometry of a memory slot that requested dirty-page logging, so that
/// logging can be enabled in `start_dirty_log` and disabled in `stop_dirty_log`.
struct KvmDirtyLogSlot {
    /// KVM memory-slot index.
    slot: u32,
    /// Guest physical base address of the slot.
    guest_phys_addr: u64,
    /// Slot size in bytes.
    memory_size: u64,
    /// Host virtual address backing the slot.
    userspace_addr: u64,
}
308 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    /// Shared handle to the KVM VM file descriptor.
    fd: Arc<VmFd>,
    /// MSR entries cloned into every vCPU created by `create_vcpu`.
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    /// Slots that requested dirty-page logging, keyed by slot number.
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
316 
317 ///
318 /// Implementation of Vm trait for KVM
319 /// Example:
320 /// #[cfg(feature = "kvm")]
321 /// extern crate hypervisor
322 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
323 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
324 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
325 /// vm.set/get().unwrap()
326 ///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        // Seed the vCPU with the VM-wide MSR list (x86_64 only); hyperv_synic
        // starts disabled and is flipped later if the capability is enabled.
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        // Translate the generic address into kvm-ioctls' representation.
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            // The datamatch width (u32 vs u64) follows the enum variant.
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is limitation on the range of the 'devid',
                    // it can not be greater than 65536 (the max of u16).
                    //
                    // BDF can not be used directly, because 'segment' is in high
                    // 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Now that we support 1 bus only in a segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits of
                    // 'segment' data.
                    // This way we can resolve the range checking problem and give
                    // different `devid` to all the devices. Limitation is that at
                    // most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        // kvm_irq_routing ends in a flexible array member, so the header plus
        // payload is allocated through the vec_with_array_field helper.
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        // All entries must be KVM entries; anything else is a programming error.
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        // Build the raw KVM region, then convert it to the generic form.
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(Arc::new(device_fd))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    // Enable the KVM_CAP_SGX_ATTRIBUTE capability, passing the SGX attribute
    // file's descriptor as the capability argument.
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    /// Re-registers every slot recorded in `dirty_log_slots` with
    /// `KVM_MEM_LOG_DIRTY_PAGES` set.
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    /// Re-registers every slot recorded in `dirty_log_slots` with flags
    /// cleared, turning dirty-page logging off again.
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    // NOTE(review): this wraps the error in `StartDirtyLog`
                    // even though we are *stopping* dirty logging — looks like
                    // a copy-paste from `start_dirty_log`. Confirm whether a
                    // `StopDirtyLog` variant exists and should be used here.
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        // `_base_gpa` is unused here: KVM identifies the region by slot number.
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        // repr(C) payload passed by raw pointer to the TDX InitVm command;
        // NOTE(review): field layout is assumed to match the kernel's TDX ABI
        // — keep in sync with the kernel definition.
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            // The kernel reads the CPUID table through this raw pointer;
            // `kvm_cpuid` stays alive until the ioctl below returns.
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            // The region size is expressed to the kernel in 4 KiB pages.
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            // metadata == 1 requests that the region be measured.
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}
841 
#[cfg(feature = "tdx")]
/// Issue a `KVM_MEMORY_ENCRYPT_OP` ioctl carrying a TDX command on `fd`.
///
/// `metadata` and `data` are forwarded verbatim in the command payload;
/// returns the OS error when the ioctl fails.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Command payload handed to the kernel; must keep C layout.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }

    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };

    // SAFETY: FFI call. All input parameters are valid and `cmd` outlives the
    // ioctl invocation.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret >= 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
874 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    /// Handle to the KVM system (from kvm-ioctls).
    kvm: Kvm,
}
879 
impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    /// Thin wrapper over kvm-ioctls' `get_msr_index_list`, mapping failures
    /// onto the crate's `GetMsrList` error variant.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}
891 
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    // A KVM capability required by this VMM is not exposed by the host.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
/// Result type returned by KVM-specific fallible operations.
pub type KvmResult<T> = result::Result<T, KvmError>;
899 impl KvmHypervisor {
900     /// Create a hypervisor based on Kvm
901     pub fn new() -> hypervisor::Result<KvmHypervisor> {
902         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
903         let api_version = kvm_obj.get_api_version();
904 
905         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
906             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
907         }
908 
909         Ok(KvmHypervisor { kvm: kvm_obj })
910     }
911 }
912 /// Implementation of Hypervisor trait for KVM
913 /// Example:
914 /// #[cfg(feature = "kvm")]
915 /// extern crate hypervisor
916 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
917 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
918 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
919 ///
920 impl hypervisor::Hypervisor for KvmHypervisor {
921     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
922     /// Example
923     /// # extern crate hypervisor;
924     /// # use hypervisor::KvmHypervisor;
925     /// use hypervisor::KvmVm;
926     /// let hypervisor = KvmHypervisor::new().unwrap();
927     /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
928     ///
929     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
930         let fd: VmFd;
931         loop {
932             match self.kvm.create_vm_with_type(vm_type) {
933                 Ok(res) => fd = res,
934                 Err(e) => {
935                     if e.errno() == libc::EINTR {
936                         // If the error returned is EINTR, which means the
937                         // ioctl has been interrupted, we have to retry as
938                         // this can't be considered as a regular error.
939                         continue;
940                     } else {
941                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
942                     }
943                 }
944             }
945             break;
946         }
947 
948         let vm_fd = Arc::new(fd);
949 
950         #[cfg(target_arch = "x86_64")]
951         {
952             let msr_list = self.get_msr_list()?;
953             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
954             let mut msrs: Vec<MsrEntry> = vec![
955                 MsrEntry {
956                     ..Default::default()
957                 };
958                 num_msrs
959             ];
960             let indices = msr_list.as_slice();
961             for (pos, index) in indices.iter().enumerate() {
962                 msrs[pos].index = *index;
963             }
964 
965             Ok(Arc::new(KvmVm {
966                 fd: vm_fd,
967                 msrs,
968                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
969             }))
970         }
971 
972         #[cfg(target_arch = "aarch64")]
973         {
974             Ok(Arc::new(KvmVm {
975                 fd: vm_fd,
976                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
977             }))
978         }
979     }
980 
981     /// Create a KVM vm object and return the object as Vm trait object
982     /// Example
983     /// # extern crate hypervisor;
984     /// # use hypervisor::KvmHypervisor;
985     /// use hypervisor::KvmVm;
986     /// let hypervisor = KvmHypervisor::new().unwrap();
987     /// let vm = hypervisor.create_vm().unwrap()
988     ///
989     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
990         #[allow(unused_mut)]
991         let mut vm_type: u64 = 0; // Create with default platform type
992 
993         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
994         // size from the host and use that when creating the VM, which may
995         // avoid unnecessary VM creation failures.
996         #[cfg(target_arch = "aarch64")]
997         if self.kvm.check_extension(Cap::ArmVmIPASize) {
998             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
999         }
1000 
1001         self.create_vm_with_type(vm_type)
1002     }
1003 
1004     fn check_required_extensions(&self) -> hypervisor::Result<()> {
1005         check_required_kvm_extensions(&self.kvm)
1006             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
1007     }
1008 
1009     #[cfg(target_arch = "x86_64")]
1010     ///
1011     /// X86 specific call to get the system supported CPUID values.
1012     ///
1013     fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
1014         let kvm_cpuid = self
1015             .kvm
1016             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
1017             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;
1018 
1019         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1020 
1021         Ok(v)
1022     }
1023 
1024     #[cfg(target_arch = "aarch64")]
1025     ///
1026     /// Retrieve AArch64 host maximum IPA size supported by KVM.
1027     ///
1028     fn get_host_ipa_limit(&self) -> i32 {
1029         self.kvm.get_host_ipa_limit()
1030     }
1031 
1032     ///
1033     /// Retrieve TDX capabilities
1034     ///
1035     #[cfg(feature = "tdx")]
1036     fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
1037         let data = TdxCapabilities {
1038             nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
1039             ..Default::default()
1040         };
1041 
1042         tdx_command(
1043             &self.kvm.as_raw_fd(),
1044             TdxCommand::Capabilities,
1045             0,
1046             &data as *const _ as u64,
1047         )
1048         .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;
1049 
1050         Ok(data)
1051     }
1052 }
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Raw KVM vCPU handle; every per-vCPU ioctl in this impl goes through it.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR entries used by get_msrs/set_msrs — presumably seeded from the
    // host-supported MSR list at vCPU creation; confirm against the caller.
    msrs: Vec<MsrEntry>,
    // Optional callbacks used to service PIO/MMIO exits in-process (see run()).
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    // Set once Hyper-V SynIC is enabled (enable_hyperv_synic); influences
    // which MSRs should be saved later.
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get_regs().unwrap();
///
1072 impl cpu::Vcpu for KvmVcpu {
1073     #[cfg(target_arch = "x86_64")]
1074     ///
1075     /// Returns the vCPU general purpose registers.
1076     ///
1077     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1078         Ok(self
1079             .fd
1080             .get_regs()
1081             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
1082             .into())
1083     }
1084     ///
1085     /// Returns the vCPU general purpose registers.
1086     /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
1087     /// is used to get registers one by one.
1088     ///
1089     #[cfg(target_arch = "aarch64")]
1090     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1091         let mut state: StandardRegisters = kvm_regs::default();
1092         let mut off = offset__of!(user_pt_regs, regs);
1093         // There are 31 user_pt_regs:
1094         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1095         // These actually are the general-purpose registers of the Armv8-a
1096         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1097         for i in 0..31 {
1098             state.regs.regs[i] = self
1099                 .fd
1100                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1101                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1102             off += std::mem::size_of::<u64>();
1103         }
1104 
1105         // We are now entering the "Other register" section of the ARMv8-a architecture.
1106         // First one, stack pointer.
1107         let off = offset__of!(user_pt_regs, sp);
1108         state.regs.sp = self
1109             .fd
1110             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1111             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1112 
1113         // Second one, the program counter.
1114         let off = offset__of!(user_pt_regs, pc);
1115         state.regs.pc = self
1116             .fd
1117             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1118             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1119 
1120         // Next is the processor state.
1121         let off = offset__of!(user_pt_regs, pstate);
1122         state.regs.pstate = self
1123             .fd
1124             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1125             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1126 
1127         // The stack pointer associated with EL1
1128         let off = offset__of!(kvm_regs, sp_el1);
1129         state.sp_el1 = self
1130             .fd
1131             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1132             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1133 
1134         // Exception Link Register for EL1, when taking an exception to EL1, this register
1135         // holds the address to which to return afterwards.
1136         let off = offset__of!(kvm_regs, elr_el1);
1137         state.elr_el1 = self
1138             .fd
1139             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1140             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1141 
1142         // Saved Program Status Registers, there are 5 of them used in the kernel.
1143         let mut off = offset__of!(kvm_regs, spsr);
1144         for i in 0..KVM_NR_SPSR as usize {
1145             state.spsr[i] = self
1146                 .fd
1147                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1148                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1149             off += std::mem::size_of::<u64>();
1150         }
1151 
1152         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1153         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1154         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1155         for i in 0..32 {
1156             state.fp_regs.vregs[i] = self
1157                 .fd
1158                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1159                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1160                 .into();
1161             off += mem::size_of::<u128>();
1162         }
1163 
1164         // Floating-point Status Register
1165         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1166         state.fp_regs.fpsr = self
1167             .fd
1168             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1169             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1170             as u32;
1171 
1172         // Floating-point Control Register
1173         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1174         state.fp_regs.fpcr = self
1175             .fd
1176             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1177             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1178             as u32;
1179         Ok(state)
1180     }
1181     #[cfg(target_arch = "x86_64")]
1182     ///
1183     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1184     ///
1185     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1186         let regs = (*regs).into();
1187         self.fd
1188             .set_regs(&regs)
1189             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1190     }
1191 
1192     ///
1193     /// Sets the vCPU general purpose registers.
1194     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1195     /// is used to set registers one by one.
1196     ///
1197     #[cfg(target_arch = "aarch64")]
1198     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1199         // The function follows the exact identical order from `state`. Look there
1200         // for some additional info on registers.
1201         let mut off = offset__of!(user_pt_regs, regs);
1202         for i in 0..31 {
1203             self.fd
1204                 .set_one_reg(
1205                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1206                     state.regs.regs[i],
1207                 )
1208                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1209             off += std::mem::size_of::<u64>();
1210         }
1211 
1212         let off = offset__of!(user_pt_regs, sp);
1213         self.fd
1214             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1215             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1216 
1217         let off = offset__of!(user_pt_regs, pc);
1218         self.fd
1219             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1220             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1221 
1222         let off = offset__of!(user_pt_regs, pstate);
1223         self.fd
1224             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1225             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1226 
1227         let off = offset__of!(kvm_regs, sp_el1);
1228         self.fd
1229             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1230             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1231 
1232         let off = offset__of!(kvm_regs, elr_el1);
1233         self.fd
1234             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1235             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1236 
1237         let mut off = offset__of!(kvm_regs, spsr);
1238         for i in 0..KVM_NR_SPSR as usize {
1239             self.fd
1240                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1241                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1242             off += std::mem::size_of::<u64>();
1243         }
1244 
1245         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1246         for i in 0..32 {
1247             self.fd
1248                 .set_one_reg(
1249                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1250                     state.fp_regs.vregs[i] as u64,
1251                 )
1252                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1253             off += mem::size_of::<u128>();
1254         }
1255 
1256         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1257         self.fd
1258             .set_one_reg(
1259                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1260                 state.fp_regs.fpsr as u64,
1261             )
1262             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1263 
1264         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1265         self.fd
1266             .set_one_reg(
1267                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1268                 state.fp_regs.fpcr as u64,
1269             )
1270             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1271         Ok(())
1272     }
1273 
1274     #[cfg(target_arch = "aarch64")]
1275     ///
1276     /// Set attribute for vcpu.
1277     ///
1278     fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
1279         self.fd
1280             .set_device_attr(attr)
1281             .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
1282     }
1283 
1284     #[cfg(target_arch = "aarch64")]
1285     ///
1286     /// Check if vcpu has a certain attribute.
1287     ///
1288     fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
1289         self.fd
1290             .has_device_attr(attr)
1291             .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
1292     }
1293 
1294     #[cfg(target_arch = "x86_64")]
1295     ///
1296     /// Returns the vCPU special registers.
1297     ///
1298     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1299         Ok(self
1300             .fd
1301             .get_sregs()
1302             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1303             .into())
1304     }
1305     #[cfg(target_arch = "x86_64")]
1306     ///
1307     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1308     ///
1309     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1310         let sregs = (*sregs).into();
1311         self.fd
1312             .set_sregs(&sregs)
1313             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1314     }
1315     #[cfg(target_arch = "x86_64")]
1316     ///
1317     /// Returns the floating point state (FPU) from the vCPU.
1318     ///
1319     fn get_fpu(&self) -> cpu::Result<FpuState> {
1320         Ok(self
1321             .fd
1322             .get_fpu()
1323             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1324             .into())
1325     }
1326     #[cfg(target_arch = "x86_64")]
1327     ///
1328     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
1329     ///
1330     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1331         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1332         self.fd
1333             .set_fpu(&fpu)
1334             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1335     }
1336     #[cfg(target_arch = "x86_64")]
1337     ///
1338     /// X86 specific call to setup the CPUID registers.
1339     ///
1340     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1341         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1342             cpuid.iter().map(|e| (*e).into()).collect();
1343         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1344             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1345 
1346         self.fd
1347             .set_cpuid2(&kvm_cpuid)
1348             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1349     }
1350     #[cfg(target_arch = "x86_64")]
1351     ///
1352     /// X86 specific call to enable HyperV SynIC
1353     ///
1354     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1355         // Update the information about Hyper-V SynIC being enabled and
1356         // emulated as it will influence later which MSRs should be saved.
1357         self.hyperv_synic.store(true, Ordering::Release);
1358 
1359         let cap = kvm_enable_cap {
1360             cap: KVM_CAP_HYPERV_SYNIC,
1361             ..Default::default()
1362         };
1363         self.fd
1364             .enable_cap(&cap)
1365             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1366     }
1367     ///
1368     /// X86 specific call to retrieve the CPUID registers.
1369     ///
1370     #[cfg(target_arch = "x86_64")]
1371     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1372         let kvm_cpuid = self
1373             .fd
1374             .get_cpuid2(num_entries)
1375             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1376 
1377         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1378 
1379         Ok(v)
1380     }
1381     #[cfg(target_arch = "x86_64")]
1382     ///
1383     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1384     ///
1385     fn get_lapic(&self) -> cpu::Result<LapicState> {
1386         Ok(self
1387             .fd
1388             .get_lapic()
1389             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1390             .into())
1391     }
1392     #[cfg(target_arch = "x86_64")]
1393     ///
1394     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1395     ///
1396     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1397         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1398         self.fd
1399             .set_lapic(&klapic)
1400             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1401     }
1402     #[cfg(target_arch = "x86_64")]
1403     ///
1404     /// Returns the model-specific registers (MSR) for this vCPU.
1405     ///
1406     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1407         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1408         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1409         let succ = self
1410             .fd
1411             .get_msrs(&mut kvm_msrs)
1412             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1413 
1414         msrs[..succ].copy_from_slice(
1415             &kvm_msrs.as_slice()[..succ]
1416                 .iter()
1417                 .map(|e| (*e).into())
1418                 .collect::<Vec<MsrEntry>>(),
1419         );
1420 
1421         Ok(succ)
1422     }
1423     #[cfg(target_arch = "x86_64")]
1424     ///
1425     /// Setup the model-specific registers (MSR) for this vCPU.
1426     /// Returns the number of MSR entries actually written.
1427     ///
1428     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1429         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1430         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1431         self.fd
1432             .set_msrs(&kvm_msrs)
1433             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1434     }
1435     ///
1436     /// Returns the vcpu's current "multiprocessing state".
1437     ///
1438     fn get_mp_state(&self) -> cpu::Result<MpState> {
1439         Ok(self
1440             .fd
1441             .get_mp_state()
1442             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1443             .into())
1444     }
1445     ///
1446     /// Sets the vcpu's current "multiprocessing state".
1447     ///
1448     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1449         self.fd
1450             .set_mp_state(mp_state.into())
1451             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1452     }
1453     #[cfg(target_arch = "x86_64")]
1454     ///
1455     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1456     ///
1457     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1458         let tr = self
1459             .fd
1460             .translate_gva(gva)
1461             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1462         // tr.valid is set if the GVA is mapped to valid GPA.
1463         match tr.valid {
1464             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1465                 "Invalid GVA: {:#x}",
1466                 gva
1467             ))),
1468             _ => Ok((tr.physical_address, 0)),
1469         }
1470     }
1471     ///
1472     /// Triggers the running of the current virtual CPU returning an exit reason.
1473     ///
1474     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1475         match self.fd.run() {
1476             Ok(run) => match run {
1477                 #[cfg(target_arch = "x86_64")]
1478                 VcpuExit::IoIn(addr, data) => {
1479                     if let Some(vm_ops) = &self.vm_ops {
1480                         return vm_ops
1481                             .pio_read(addr.into(), data)
1482                             .map(|_| cpu::VmExit::Ignore)
1483                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1484                     }
1485 
1486                     Ok(cpu::VmExit::IoIn(addr, data))
1487                 }
1488                 #[cfg(target_arch = "x86_64")]
1489                 VcpuExit::IoOut(addr, data) => {
1490                     if let Some(vm_ops) = &self.vm_ops {
1491                         return vm_ops
1492                             .pio_write(addr.into(), data)
1493                             .map(|_| cpu::VmExit::Ignore)
1494                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1495                     }
1496 
1497                     Ok(cpu::VmExit::IoOut(addr, data))
1498                 }
1499                 #[cfg(target_arch = "x86_64")]
1500                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1501                 #[cfg(target_arch = "x86_64")]
1502                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1503 
1504                 #[cfg(target_arch = "aarch64")]
1505                 VcpuExit::SystemEvent(event_type, flags) => {
1506                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1507                     // On Aarch64, when the VM is shutdown, run() returns
1508                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
1509                     if event_type == KVM_SYSTEM_EVENT_RESET {
1510                         Ok(cpu::VmExit::Reset)
1511                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1512                         Ok(cpu::VmExit::Shutdown)
1513                     } else {
1514                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1515                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1516                             event_type,
1517                             flags
1518                         )))
1519                     }
1520                 }
1521 
1522                 VcpuExit::MmioRead(addr, data) => {
1523                     if let Some(vm_ops) = &self.vm_ops {
1524                         return vm_ops
1525                             .mmio_read(addr, data)
1526                             .map(|_| cpu::VmExit::Ignore)
1527                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1528                     }
1529 
1530                     Ok(cpu::VmExit::MmioRead(addr, data))
1531                 }
1532                 VcpuExit::MmioWrite(addr, data) => {
1533                     if let Some(vm_ops) = &self.vm_ops {
1534                         return vm_ops
1535                             .mmio_write(addr, data)
1536                             .map(|_| cpu::VmExit::Ignore)
1537                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1538                     }
1539 
1540                     Ok(cpu::VmExit::MmioWrite(addr, data))
1541                 }
1542                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1543                 #[cfg(feature = "tdx")]
1544                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1545                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1546 
1547                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1548                     "Unexpected exit reason on vcpu run: {:?}",
1549                     r
1550                 ))),
1551             },
1552 
1553             Err(ref e) => match e.errno() {
1554                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1555                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1556                     "VCPU error {:?}",
1557                     e
1558                 ))),
1559             },
1560         }
1561     }
1562     #[cfg(target_arch = "x86_64")]
1563     ///
1564     /// Let the guest know that it has been paused, which prevents from
1565     /// potential soft lockups when being resumed.
1566     ///
1567     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1568         if let Err(e) = self.fd.kvmclock_ctrl() {
1569             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1570             // which could be because we're still in firmware or the guest doesn't
1571             // use KVM clock.
1572             if e.errno() != libc::EINVAL {
1573                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1574             }
1575         }
1576 
1577         Ok(())
1578     }
1579     #[cfg(target_arch = "x86_64")]
1580     ///
1581     /// Sets debug registers to set hardware breakpoints and/or enable single step.
1582     ///
1583     fn set_guest_debug(
1584         &self,
1585         addrs: &[vm_memory::GuestAddress],
1586         singlestep: bool,
1587     ) -> cpu::Result<()> {
1588         if addrs.len() > 4 {
1589             return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
1590                 "Support 4 breakpoints at most but {} addresses are passed",
1591                 addrs.len()
1592             )));
1593         }
1594 
1595         let mut dbg = kvm_guest_debug {
1596             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1597             ..Default::default()
1598         };
1599         if singlestep {
1600             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1601         }
1602 
1603         // Set bits 9 and 10.
1604         // bit 9: GE (global exact breakpoint enable) flag.
1605         // bit 10: always 1.
1606         dbg.arch.debugreg[7] = 0x0600;
1607 
1608         for (i, addr) in addrs.iter().enumerate() {
1609             dbg.arch.debugreg[i] = addr.0;
1610             // Set global breakpoint enable flag
1611             dbg.arch.debugreg[7] |= 2 << (i * 2);
1612         }
1613 
1614         self.fd
1615             .set_guest_debug(&dbg)
1616             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1617     }
1618     #[cfg(target_arch = "aarch64")]
1619     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1620         self.fd
1621             .vcpu_init(kvi)
1622             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1623     }
1624     ///
1625     /// Sets the value of one register for this vCPU.
1626     ///
1627     #[cfg(target_arch = "aarch64")]
1628     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1629         self.fd
1630             .set_one_reg(reg_id, data)
1631             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1632     }
1633     ///
1634     /// Gets the value of one register for this vCPU.
1635     ///
1636     #[cfg(target_arch = "aarch64")]
1637     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1638         self.fd
1639             .get_one_reg(reg_id)
1640             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1641     }
1642     ///
1643     /// Gets a list of the guest registers that are supported for the
1644     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1645     ///
1646     #[cfg(target_arch = "aarch64")]
1647     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1648         self.fd
1649             .get_reg_list(reg_list)
1650             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1651     }
1652     ///
1653     /// Save the state of the system registers.
1654     ///
1655     #[cfg(target_arch = "aarch64")]
1656     fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
1657         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1658         // around 500 registers.
1659         let mut state: Vec<Register> = Vec::new();
1660         let mut reg_list = RegList::new(500).unwrap();
1661         self.fd
1662             .get_reg_list(&mut reg_list)
1663             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1664 
1665         // At this point reg_list should contain: core registers and system registers.
1666         // The register list contains the number of registers and their ids. We will be needing to
1667         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1668         // the core registers which are represented in the kernel by kvm_regs structure and for which
1669         // we can calculate the id based on the offset in the structure.
1670         reg_list.retain(|regid| is_system_register(*regid));
1671 
1672         // Now, for the rest of the registers left in the previously fetched register list, we are
1673         // simply calling KVM_GET_ONE_REG.
1674         let indices = reg_list.as_slice();
1675         for index in indices.iter() {
1676             state.push(kvm_bindings::kvm_one_reg {
1677                 id: *index,
1678                 addr: self
1679                     .fd
1680                     .get_one_reg(*index)
1681                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1682             });
1683         }
1684 
1685         Ok(state)
1686     }
1687     ///
1688     /// Restore the state of the system registers.
1689     ///
1690     #[cfg(target_arch = "aarch64")]
1691     fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
1692         for reg in state {
1693             self.fd
1694                 .set_one_reg(reg.id, reg.addr)
1695                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1696         }
1697         Ok(())
1698     }
1699     ///
1700     /// Read the MPIDR - Multiprocessor Affinity Register.
1701     ///
1702     #[cfg(target_arch = "aarch64")]
1703     fn read_mpidr(&self) -> cpu::Result<u64> {
1704         self.fd
1705             .get_one_reg(MPIDR_EL1)
1706             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1707     }
1708     ///
1709     /// Configure core registers for a given CPU.
1710     ///
1711     #[cfg(target_arch = "aarch64")]
1712     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1713         #[allow(non_upper_case_globals)]
1714         // PSR (Processor State Register) bits.
1715         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1716         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1717         const PSR_F_BIT: u64 = 0x0000_0040;
1718         const PSR_I_BIT: u64 = 0x0000_0080;
1719         const PSR_A_BIT: u64 = 0x0000_0100;
1720         const PSR_D_BIT: u64 = 0x0000_0200;
1721         // Taken from arch/arm64/kvm/inject_fault.c.
1722         const PSTATE_FAULT_BITS_64: u64 =
1723             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1724 
1725         let kreg_off = offset__of!(kvm_regs, regs);
1726 
1727         // Get the register index of the PSTATE (Processor State) register.
1728         let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
1729         self.set_reg(
1730             arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1731             PSTATE_FAULT_BITS_64,
1732         )
1733         .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1734 
1735         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1736         if cpu_id == 0 {
1737             // Setting the PC (Processor Counter) to the current program address (kernel address).
1738             let pc = offset__of!(user_pt_regs, pc) + kreg_off;
1739             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
1740                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1741 
1742             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1743             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1744             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1745             // We are choosing to place it the end of DRAM. See `get_fdt_addr`.
1746             let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
1747             self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
1748                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1749         }
1750         Ok(())
1751     }
1752 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        // MP state early: KVM_GET_MP_STATE may modify vCPU/LAPIC state (see
        // the ordering requirements in the doc comment above).
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated. These are synthetic MSR indices in the 0x4000_00xx range.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // get_msrs() reports how many entries were read before KVM
            // stopped; the entry at that index is the one it rejected.
            let mut faulty_msr_index = num_msrs;
            // Keep the successfully-read prefix; the rest is retried below.
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                // Collect whatever was successfully read from this chunk.
                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                // The whole remaining chunk was read: we are done.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                // Translate the sub-slice index back to an index into the
                // full `msr_entries` list and retry after the bad entry.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        // Vcpu events are saved last (see the doc comment above).
        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
1875     ///
1876     /// Get the current AArch64 CPU state
1877     ///
1878     #[cfg(target_arch = "aarch64")]
1879     fn state(&self) -> cpu::Result<CpuState> {
1880         let mut state = VcpuKvmState {
1881             mp_state: self.get_mp_state()?.into(),
1882             mpidr: self.read_mpidr()?,
1883             ..Default::default()
1884         };
1885         state.core_regs = self.get_regs()?;
1886         state.sys_regs = self.get_sys_regs()?;
1887 
1888         Ok(state.into())
1889     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        // The order of the SET calls below follows the ordering requirements
        // documented above; do not reorder without re-reading them.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // set_msrs() reports how many entries were written before KVM
            // stopped; the entry at that index is the one it rejected.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                // The whole remaining chunk was written: we are done.
                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                // Translate the sub-slice index back to an index into the
                // full `state.msrs` list and retry after the bad entry.
                faulty_msr_index = start_pos + num_msrs;
            }
        }

        // Vcpu events are restored last (see the doc comment above).
        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1976     ///
1977     /// Restore the previously saved AArch64 CPU state
1978     ///
1979     #[cfg(target_arch = "aarch64")]
1980     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1981         let state: VcpuKvmState = state.clone().into();
1982         self.set_regs(&state.core_regs)?;
1983         self.set_sys_regs(&state.sys_regs)?;
1984         self.set_mp_state(state.mp_state.into())?;
1985 
1986         Ok(())
1987     }
1988 
1989     ///
1990     /// Initialize TDX for this CPU
1991     ///
1992     #[cfg(feature = "tdx")]
1993     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1994         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1995             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1996     }
1997 
1998     ///
1999     /// Set the "immediate_exit" state
2000     ///
2001     fn set_immediate_exit(&self, exit: bool) {
2002         self.fd.set_kvm_immediate_exit(exit.into());
2003     }
2004 
2005     ///
2006     /// Returns the details about TDX exit reason
2007     ///
2008     #[cfg(feature = "tdx")]
2009     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2010         let kvm_run = self.fd.get_kvm_run();
2011         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2012 
2013         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2014 
2015         if tdx_vmcall.type_ != 0 {
2016             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2017         }
2018 
2019         match tdx_vmcall.subfunction {
2020             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2021             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2022                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2023             }
2024             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2025         }
2026     }
2027 
2028     ///
2029     /// Set the status code for TDX exit
2030     ///
2031     #[cfg(feature = "tdx")]
2032     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2033         let kvm_run = self.fd.get_kvm_run();
2034         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2035 
2036         tdx_vmcall.status_code = match status {
2037             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2038             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2039         };
2040     }
2041     #[cfg(target_arch = "x86_64")]
2042     ///
2043     /// Return the list of initial MSR entries for a VCPU
2044     ///
2045     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2046         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2047 
2048         [
2049             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2050             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2051             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2052             msr!(msr_index::MSR_STAR),
2053             msr!(msr_index::MSR_CSTAR),
2054             msr!(msr_index::MSR_LSTAR),
2055             msr!(msr_index::MSR_KERNEL_GS_BASE),
2056             msr!(msr_index::MSR_SYSCALL_MASK),
2057             msr!(msr_index::MSR_IA32_TSC),
2058             msr_data!(
2059                 msr_index::MSR_IA32_MISC_ENABLE,
2060                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2061             ),
2062             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2063         ]
2064         .to_vec()
2065     }
2066 }
2067 
2068 impl KvmVcpu {
2069     #[cfg(target_arch = "x86_64")]
2070     ///
2071     /// X86 specific call that returns the vcpu's current "xsave struct".
2072     ///
2073     fn get_xsave(&self) -> cpu::Result<Xsave> {
2074         self.fd
2075             .get_xsave()
2076             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2077     }
2078     #[cfg(target_arch = "x86_64")]
2079     ///
2080     /// X86 specific call that sets the vcpu's current "xsave struct".
2081     ///
2082     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2083         self.fd
2084             .set_xsave(xsave)
2085             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2086     }
2087     #[cfg(target_arch = "x86_64")]
2088     ///
2089     /// X86 specific call that returns the vcpu's current "xcrs".
2090     ///
2091     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2092         self.fd
2093             .get_xcrs()
2094             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2095     }
2096     #[cfg(target_arch = "x86_64")]
2097     ///
2098     /// X86 specific call that sets the vcpu's current "xcrs".
2099     ///
2100     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2101         self.fd
2102             .set_xcrs(xcrs)
2103             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2104     }
2105     #[cfg(target_arch = "x86_64")]
2106     ///
2107     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2108     /// states of the vcpu.
2109     ///
2110     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2111         self.fd
2112             .get_vcpu_events()
2113             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2114     }
2115     #[cfg(target_arch = "x86_64")]
2116     ///
2117     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2118     /// of the vcpu.
2119     ///
2120     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2121         self.fd
2122             .set_vcpu_events(events)
2123             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2124     }
2125 }
2126 
/// Device struct for KVM.
///
/// Directly aliases the kvm-ioctls `DeviceFd`; the `device::Device` trait
/// below adapts it to the hypervisor-agnostic interface.
pub type KvmDevice = DeviceFd;
2129 
2130 impl device::Device for KvmDevice {
2131     ///
2132     /// Set device attribute
2133     ///
2134     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
2135         self.set_device_attr(attr)
2136             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
2137     }
2138     ///
2139     /// Get device attribute
2140     ///
2141     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
2142         self.get_device_attr(attr)
2143             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
2144     }
2145     ///
2146     /// Cast to the underlying KVM device fd
2147     ///
2148     fn as_any(&self) -> &dyn Any {
2149         self
2150     }
2151 }
2152