xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 9bcb9849629b2a6f67ecf3ba32d18a8d2582022c)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 use std::collections::HashMap;
26 #[cfg(target_arch = "aarch64")]
27 use std::convert::TryInto;
28 #[cfg(target_arch = "x86_64")]
29 use std::fs::File;
30 use std::os::unix::io::{AsRawFd, RawFd};
31 use std::result;
32 #[cfg(target_arch = "x86_64")]
33 use std::sync::atomic::{AtomicBool, Ordering};
34 use std::sync::{Arc, RwLock};
35 use vmm_sys_util::eventfd::EventFd;
36 // x86_64 dependencies
37 #[cfg(target_arch = "x86_64")]
38 pub mod x86_64;
39 #[cfg(target_arch = "x86_64")]
40 use crate::arch::x86::NUM_IOAPIC_PINS;
41 #[cfg(target_arch = "aarch64")]
42 use aarch64::{RegList, Register, StandardRegisters};
43 #[cfg(target_arch = "x86_64")]
44 use kvm_bindings::{
45     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
46 };
47 #[cfg(target_arch = "x86_64")]
48 use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
49 #[cfg(target_arch = "x86_64")]
50 pub use x86_64::{
51     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
52     Xsave, CPUID_FLAG_VALID_INDEX,
53 };
54 // aarch64 dependencies
55 #[cfg(target_arch = "aarch64")]
56 pub mod aarch64;
57 pub use kvm_bindings;
58 #[cfg(feature = "tdx")]
59 use kvm_bindings::KVMIO;
60 pub use kvm_bindings::{
61     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
62     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
63     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
64 };
65 #[cfg(target_arch = "aarch64")]
66 use kvm_bindings::{
67     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
68     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
69 };
70 pub use kvm_ioctls;
71 pub use kvm_ioctls::{Cap, Kvm};
72 #[cfg(target_arch = "aarch64")]
73 use std::mem;
74 use thiserror::Error;
75 #[cfg(feature = "tdx")]
76 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
77 ///
78 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
79 ///
80 pub use {
81     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
82     kvm_bindings::kvm_device_attr as DeviceAttr,
83     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
84     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
85     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
86     kvm_ioctls::VcpuExit,
87 };
88 
#[cfg(target_arch = "x86_64")]
// Capability number for KVM_CAP_SGX_ATTRIBUTE, used by `enable_sgx_attribute`.
// NOTE(review): hard-coded here — presumably not (yet) exported by the
// kvm-bindings version in use; keep in sync with the kernel's kvm.h.
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
91 
#[cfg(feature = "tdx")]
// Generates the `KVM_MEMORY_ENCRYPT_OP()` ioctl number (_IOWR on the KVM
// ioctl namespace, nr 0xba) used by `tdx_command` below.
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
94 
#[cfg(feature = "tdx")]
#[repr(u32)]
// Sub-command selector embedded in the payload of `KVM_MEMORY_ENCRYPT_OP`
// (see `tdx_command`). `#[repr(u32)]` pins the discriminant width so the
// values line up with what the kernel-side TDX support expects.
enum TdxCommand {
    // Not issued anywhere in this file yet, hence the allow.
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
105 
/// VM-specific state exchanged through `Vm::state`/`Vm::set_state`.
/// Currently empty for KVM — there is no VM-scoped state to snapshot here.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}
108 
109 pub use KvmVmState as VmState;
110 
/// Bookkeeping entry for a memory slot registered with dirty-page logging
/// requested. Mirrors the fields of `kvm_userspace_memory_region` (minus
/// flags) so the region can be re-submitted by `start_dirty_log` /
/// `stop_dirty_log` with the logging flag toggled.
struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}
117 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Kernel VM file descriptor; shared with users of this VM object.
    fd: Arc<VmFd>,
    /// MSR index template; cloned into every vCPU in `create_vcpu`.
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    // VM-scoped state returned by `Vm::state` (currently empty for KVM).
    state: KvmVmState,
    /// Slots that requested dirty-page logging, keyed by slot number.
    /// The `KVM_MEM_LOG_DIRTY_PAGES` flag is applied lazily in
    /// `start_dirty_log` rather than at region-creation time.
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
126 
127 ///
128 /// Implementation of Vm trait for KVM
129 /// Example:
130 /// #[cfg(feature = "kvm")]
131 /// extern crate hypervisor
132 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
133 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
134 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
135 /// vm.set/get().unwrap()
136 ///
137 impl vm::Vm for KvmVm {
138     #[cfg(target_arch = "x86_64")]
139     ///
140     /// Sets the address of the one-page region in the VM's address space.
141     ///
142     fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
143         self.fd
144             .set_identity_map_address(address)
145             .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
146     }
147     #[cfg(target_arch = "x86_64")]
148     ///
149     /// Sets the address of the three-page region in the VM's address space.
150     ///
151     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
152         self.fd
153             .set_tss_address(offset)
154             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
155     }
156     ///
157     /// Creates an in-kernel interrupt controller.
158     ///
159     fn create_irq_chip(&self) -> vm::Result<()> {
160         self.fd
161             .create_irq_chip()
162             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
163     }
164     ///
165     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
166     ///
167     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
168         self.fd
169             .register_irqfd(fd, gsi)
170             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
171     }
172     ///
173     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
174     ///
175     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
176         self.fd
177             .unregister_irqfd(fd, gsi)
178             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
179     }
180     ///
181     /// Creates a VcpuFd object from a vcpu RawFd.
182     ///
183     fn create_vcpu(
184         &self,
185         id: u8,
186         vmmops: Option<Arc<dyn VmmOps>>,
187     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
188         let vc = self
189             .fd
190             .create_vcpu(id as u64)
191             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
192         let vcpu = KvmVcpu {
193             fd: vc,
194             #[cfg(target_arch = "x86_64")]
195             msrs: self.msrs.clone(),
196             vmmops,
197             #[cfg(target_arch = "x86_64")]
198             hyperv_synic: AtomicBool::new(false),
199         };
200         Ok(Arc::new(vcpu))
201     }
202     ///
203     /// Registers an event to be signaled whenever a certain address is written to.
204     ///
205     fn register_ioevent(
206         &self,
207         fd: &EventFd,
208         addr: &IoEventAddress,
209         datamatch: Option<vm::DataMatch>,
210     ) -> vm::Result<()> {
211         if let Some(dm) = datamatch {
212             match dm {
213                 vm::DataMatch::DataMatch32(kvm_dm32) => self
214                     .fd
215                     .register_ioevent(fd, addr, kvm_dm32)
216                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
217                 vm::DataMatch::DataMatch64(kvm_dm64) => self
218                     .fd
219                     .register_ioevent(fd, addr, kvm_dm64)
220                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
221             }
222         } else {
223             self.fd
224                 .register_ioevent(fd, addr, NoDatamatch)
225                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
226         }
227     }
228     ///
229     /// Unregisters an event from a certain address it has been previously registered to.
230     ///
231     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
232         self.fd
233             .unregister_ioevent(fd, addr, NoDatamatch)
234             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
235     }
236     ///
237     /// Sets the GSI routing table entries, overwriting any previously set
238     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
239     ///
240     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
241         let mut irq_routing =
242             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
243         irq_routing[0].nr = entries.len() as u32;
244         irq_routing[0].flags = 0;
245 
246         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
247         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
248         // everything from entries.
249         unsafe {
250             let entries_slice: &mut [kvm_irq_routing_entry] =
251                 irq_routing[0].entries.as_mut_slice(entries.len());
252             entries_slice.copy_from_slice(entries);
253         }
254 
255         self.fd
256             .set_gsi_routing(&irq_routing[0])
257             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
258     }
259     ///
260     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
261     ///
262     fn make_user_memory_region(
263         &self,
264         slot: u32,
265         guest_phys_addr: u64,
266         memory_size: u64,
267         userspace_addr: u64,
268         readonly: bool,
269         log_dirty_pages: bool,
270     ) -> MemoryRegion {
271         MemoryRegion {
272             slot,
273             guest_phys_addr,
274             memory_size,
275             userspace_addr,
276             flags: if readonly { KVM_MEM_READONLY } else { 0 }
277                 | if log_dirty_pages {
278                     KVM_MEM_LOG_DIRTY_PAGES
279                 } else {
280                     0
281                 },
282         }
283     }
284     ///
285     /// Creates a guest physical memory region.
286     ///
287     fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
288         let mut region = user_memory_region;
289 
290         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
291             if (region.flags & KVM_MEM_READONLY) != 0 {
292                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
293                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
294                 )));
295             }
296 
297             // Keep track of the regions that need dirty pages log
298             self.dirty_log_slots.write().unwrap().insert(
299                 region.slot,
300                 KvmDirtyLogSlot {
301                     slot: region.slot,
302                     guest_phys_addr: region.guest_phys_addr,
303                     memory_size: region.memory_size,
304                     userspace_addr: region.userspace_addr,
305                 },
306             );
307 
308             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
309             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
310             region.flags = 0;
311         }
312 
313         // SAFETY: Safe because guest regions are guaranteed not to overlap.
314         unsafe {
315             self.fd
316                 .set_user_memory_region(region)
317                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
318         }
319     }
320     ///
321     /// Removes a guest physical memory region.
322     ///
323     fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
324         let mut region = user_memory_region;
325 
326         // Remove the corresponding entry from "self.dirty_log_slots" if needed
327         self.dirty_log_slots.write().unwrap().remove(&region.slot);
328 
329         // Setting the size to 0 means "remove"
330         region.memory_size = 0;
331         // SAFETY: Safe because guest regions are guaranteed not to overlap.
332         unsafe {
333             self.fd
334                 .set_user_memory_region(region)
335                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
336         }
337     }
338     ///
339     /// Creates an emulated device in the kernel.
340     ///
341     /// See the documentation for `KVM_CREATE_DEVICE`.
342     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
343         let fd = self
344             .fd
345             .create_device(device)
346             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
347         let device = KvmDevice { fd };
348         Ok(Arc::new(device))
349     }
350     ///
351     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
352     ///
353     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
354     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
355         self.fd
356             .get_preferred_target(kvi)
357             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
358     }
359     #[cfg(target_arch = "x86_64")]
360     fn enable_split_irq(&self) -> vm::Result<()> {
361         // Create split irqchip
362         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
363         // are not.
364         let mut cap = kvm_enable_cap {
365             cap: KVM_CAP_SPLIT_IRQCHIP,
366             ..Default::default()
367         };
368         cap.args[0] = NUM_IOAPIC_PINS as u64;
369         self.fd
370             .enable_cap(&cap)
371             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
372         Ok(())
373     }
374     #[cfg(target_arch = "x86_64")]
375     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
376         let mut cap = kvm_enable_cap {
377             cap: KVM_CAP_SGX_ATTRIBUTE,
378             ..Default::default()
379         };
380         cap.args[0] = file.as_raw_fd() as u64;
381         self.fd
382             .enable_cap(&cap)
383             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
384         Ok(())
385     }
386     /// Retrieve guest clock.
387     #[cfg(target_arch = "x86_64")]
388     fn get_clock(&self) -> vm::Result<ClockData> {
389         self.fd
390             .get_clock()
391             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
392     }
393     /// Set guest clock.
394     #[cfg(target_arch = "x86_64")]
395     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
396         self.fd
397             .set_clock(data)
398             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
399     }
400     /// Checks if a particular `Cap` is available.
401     fn check_extension(&self, c: Cap) -> bool {
402         self.fd.check_extension(c)
403     }
404     /// Create a device that is used for passthrough
405     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
406         let mut vfio_dev = kvm_create_device {
407             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
408             fd: 0,
409             flags: 0,
410         };
411 
412         self.create_device(&mut vfio_dev)
413             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
414     }
415     ///
416     /// Get the Vm state. Return VM specific data
417     ///
418     fn state(&self) -> vm::Result<VmState> {
419         Ok(self.state)
420     }
421     ///
422     /// Set the VM state
423     ///
424     fn set_state(&self, _state: VmState) -> vm::Result<()> {
425         Ok(())
426     }
427 
428     ///
429     /// Start logging dirty pages
430     ///
431     fn start_dirty_log(&self) -> vm::Result<()> {
432         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
433         for (_, s) in dirty_log_slots.iter() {
434             let region = MemoryRegion {
435                 slot: s.slot,
436                 guest_phys_addr: s.guest_phys_addr,
437                 memory_size: s.memory_size,
438                 userspace_addr: s.userspace_addr,
439                 flags: KVM_MEM_LOG_DIRTY_PAGES,
440             };
441             // SAFETY: Safe because guest regions are guaranteed not to overlap.
442             unsafe {
443                 self.fd
444                     .set_user_memory_region(region)
445                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
446             }
447         }
448 
449         Ok(())
450     }
451 
452     ///
453     /// Stop logging dirty pages
454     ///
455     fn stop_dirty_log(&self) -> vm::Result<()> {
456         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
457         for (_, s) in dirty_log_slots.iter() {
458             let region = MemoryRegion {
459                 slot: s.slot,
460                 guest_phys_addr: s.guest_phys_addr,
461                 memory_size: s.memory_size,
462                 userspace_addr: s.userspace_addr,
463                 flags: 0,
464             };
465             // SAFETY: Safe because guest regions are guaranteed not to overlap.
466             unsafe {
467                 self.fd
468                     .set_user_memory_region(region)
469                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
470             }
471         }
472 
473         Ok(())
474     }
475 
476     ///
477     /// Get dirty pages bitmap (one bit per page)
478     ///
479     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
480         self.fd
481             .get_dirty_log(slot, memory_size as usize)
482             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
483     }
484 
485     ///
486     /// Initialize TDX for this VM
487     ///
488     #[cfg(feature = "tdx")]
489     fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
490         #[repr(C)]
491         struct TdxInitVm {
492             max_vcpus: u32,
493             tsc_khz: u32,
494             attributes: u64,
495             cpuid: u64,
496             mrconfigid: [u64; 6],
497             mrowner: [u64; 6],
498             mrownerconfig: [u64; 6],
499             reserved: [u64; 43],
500         }
501         let data = TdxInitVm {
502             max_vcpus,
503             tsc_khz: 0,
504             attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
505             cpuid: cpuid.as_fam_struct_ptr() as u64,
506             mrconfigid: [0; 6],
507             mrowner: [0; 6],
508             mrownerconfig: [0; 6],
509             reserved: [0; 43],
510         };
511 
512         tdx_command(
513             &self.fd.as_raw_fd(),
514             TdxCommand::InitVm,
515             0,
516             &data as *const _ as u64,
517         )
518         .map_err(vm::HypervisorVmError::InitializeTdx)
519     }
520 
521     ///
522     /// Finalize the TDX setup for this VM
523     ///
524     #[cfg(feature = "tdx")]
525     fn tdx_finalize(&self) -> vm::Result<()> {
526         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
527             .map_err(vm::HypervisorVmError::FinalizeTdx)
528     }
529 
530     ///
531     /// Initialize memory regions for the TDX VM
532     ///
533     #[cfg(feature = "tdx")]
534     fn tdx_init_memory_region(
535         &self,
536         host_address: u64,
537         guest_address: u64,
538         size: u64,
539         measure: bool,
540     ) -> vm::Result<()> {
541         #[repr(C)]
542         struct TdxInitMemRegion {
543             host_address: u64,
544             guest_address: u64,
545             pages: u64,
546         }
547         let data = TdxInitMemRegion {
548             host_address,
549             guest_address,
550             pages: size / 4096,
551         };
552 
553         tdx_command(
554             &self.fd.as_raw_fd(),
555             TdxCommand::InitMemRegion,
556             if measure { 1 } else { 0 },
557             &data as *const _ as u64,
558         )
559         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
560     }
561 }
562 
#[cfg(feature = "tdx")]
/// Send a single TDX sub-command to the kernel via the
/// `KVM_MEMORY_ENCRYPT_OP` ioctl issued on `fd`.
///
/// `metadata` and `data` are payload words interpreted by the kernel
/// according to `command`; on failure the thread's `errno` is returned
/// as an `io::Error`.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // In-memory layout of the command block handed to the kernel; field
    // order and sizes must match the kernel-side TDX ABI.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let tdx_cmd = TdxIoctlCmd {
        data,
        metadata,
        command,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &tdx_cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret >= 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
595 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to the KVM subsystem (the /dev/kvm device, per kvm-ioctls);
    // all system-level ioctls go through this.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    /// A KVM extension required by the VMM is not offered by this host.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
/// Convenience alias for results whose error type is [`KvmError`].
pub type KvmResult<T> = result::Result<T, KvmError>;
607 impl KvmHypervisor {
608     /// Create a hypervisor based on Kvm
609     pub fn new() -> hypervisor::Result<KvmHypervisor> {
610         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
611         let api_version = kvm_obj.get_api_version();
612 
613         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
614             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
615         }
616 
617         Ok(KvmHypervisor { kvm: kvm_obj })
618     }
619 }
620 /// Implementation of Hypervisor trait for KVM
621 /// Example:
622 /// #[cfg(feature = "kvm")]
623 /// extern crate hypervisor
624 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
625 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
626 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
627 ///
628 impl hypervisor::Hypervisor for KvmHypervisor {
629     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
630     /// Example
631     /// # extern crate hypervisor;
632     /// # use hypervisor::KvmHypervisor;
633     /// use hypervisor::KvmVm;
634     /// let hypervisor = KvmHypervisor::new().unwrap();
635     /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
636     ///
637     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
638         let fd: VmFd;
639         loop {
640             match self.kvm.create_vm_with_type(vm_type) {
641                 Ok(res) => fd = res,
642                 Err(e) => {
643                     if e.errno() == libc::EINTR {
644                         // If the error returned is EINTR, which means the
645                         // ioctl has been interrupted, we have to retry as
646                         // this can't be considered as a regular error.
647                         continue;
648                     } else {
649                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
650                     }
651                 }
652             }
653             break;
654         }
655 
656         let vm_fd = Arc::new(fd);
657 
658         #[cfg(target_arch = "x86_64")]
659         {
660             let msr_list = self.get_msr_list()?;
661             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
662             let mut msrs = MsrEntries::new(num_msrs).unwrap();
663             let indices = msr_list.as_slice();
664             let msr_entries = msrs.as_mut_slice();
665             for (pos, index) in indices.iter().enumerate() {
666                 msr_entries[pos].index = *index;
667             }
668 
669             Ok(Arc::new(KvmVm {
670                 fd: vm_fd,
671                 msrs,
672                 state: VmState {},
673                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
674             }))
675         }
676 
677         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
678         {
679             Ok(Arc::new(KvmVm {
680                 fd: vm_fd,
681                 state: VmState {},
682                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
683             }))
684         }
685     }
686 
687     /// Create a KVM vm object and return the object as Vm trait object
688     /// Example
689     /// # extern crate hypervisor;
690     /// # use hypervisor::KvmHypervisor;
691     /// use hypervisor::KvmVm;
692     /// let hypervisor = KvmHypervisor::new().unwrap();
693     /// let vm = hypervisor.create_vm().unwrap()
694     ///
695     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
696         #[allow(unused_mut)]
697         let mut vm_type: u64 = 0; // Create with default platform type
698 
699         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
700         // size from the host and use that when creating the VM, which may
701         // avoid unnecessary VM creation failures.
702         #[cfg(target_arch = "aarch64")]
703         if self.kvm.check_extension(Cap::ArmVmIPASize) {
704             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
705         }
706 
707         self.create_vm_with_type(vm_type)
708     }
709 
710     fn check_required_extensions(&self) -> hypervisor::Result<()> {
711         check_required_kvm_extensions(&self.kvm)
712             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
713     }
714 
715     #[cfg(target_arch = "x86_64")]
716     ///
717     /// X86 specific call to get the system supported CPUID values.
718     ///
719     fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
720         self.kvm
721             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
722             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
723     }
724 
725     #[cfg(target_arch = "x86_64")]
726     ///
727     /// Retrieve the list of MSRs supported by KVM.
728     ///
729     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
730         self.kvm
731             .get_msr_index_list()
732             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
733     }
734     #[cfg(target_arch = "aarch64")]
735     ///
736     /// Retrieve AArch64 host maximum IPA size supported by KVM.
737     ///
738     fn get_host_ipa_limit(&self) -> i32 {
739         self.kvm.get_host_ipa_limit()
740     }
741 }
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Kernel vCPU file descriptor.
    fd: VcpuFd,
    /// MSR index template copied from the owning `KvmVm` at creation.
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    // Optional callbacks into the VMM — NOTE(review): exact semantics of
    // `VmmOps` are defined elsewhere in the crate; confirm before relying.
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    /// Set to true by `enable_hyperv_synic`; per the comment there, this
    /// influences which MSRs should later be saved.
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
751 /// Implementation of Vcpu trait for KVM
752 /// Example:
753 /// #[cfg(feature = "kvm")]
754 /// extern crate hypervisor
755 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
756 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
757 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
758 /// let vcpu = vm.create_vcpu(0, None).unwrap();
759 /// vcpu.get/set().unwrap()
760 ///
761 impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers using the `KVM_GET_REGS` ioctl.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    /// Kernel failures are wrapped as `HypervisorCpuError::SetStandardRegs`.
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }
780 
    #[cfg(target_arch = "aarch64")]
    ///
    /// Set attribute for vcpu.
    ///
    /// Forwarded to the vCPU device-attribute interface (`set_device_attr`).
    fn set_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .set_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuAttribute(e.into()))
    }
790 
    #[cfg(target_arch = "aarch64")]
    ///
    /// Check if vcpu has a certain attribute.
    ///
    /// Forwarded to the vCPU device-attribute interface (`has_device_attr`);
    /// returns Ok(()) when the attribute is supported.
    fn has_vcpu_attr(&self, attr: &DeviceAttr) -> cpu::Result<()> {
        self.fd
            .has_device_attr(attr)
            .map_err(|e| cpu::HypervisorCpuError::HasVcpuAttribute(e.into()))
    }
800 
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers using the `KVM_GET_SREGS` ioctl.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    /// Kernel failures are wrapped as `HypervisorCpuError::SetSpecialRegs`.
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU using the
    /// `KVM_GET_FPU` ioctl.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers (`KVM_SET_CPUID2` ioctl).
    ///
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    /// Records the fact locally (see `hyperv_synic`) before asking the
    /// kernel to enable the `KVM_CAP_HYPERV_SYNIC` capability.
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        // No extra arguments are needed for this capability.
        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers (`KVM_GET_CPUID2`
    /// ioctl), with room for up to `num_entries` entries.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
872     #[cfg(target_arch = "x86_64")]
873     ///
874     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
875     ///
876     fn get_lapic(&self) -> cpu::Result<LapicState> {
877         self.fd
878             .get_lapic()
879             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
880     }
881     #[cfg(target_arch = "x86_64")]
882     ///
883     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
884     ///
885     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
886         self.fd
887             .set_lapic(klapic)
888             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
889     }
890     #[cfg(target_arch = "x86_64")]
891     ///
892     /// Returns the model-specific registers (MSR) for this vCPU.
893     ///
894     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
895         self.fd
896             .get_msrs(msrs)
897             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
898     }
899     #[cfg(target_arch = "x86_64")]
900     ///
901     /// Setup the model-specific registers (MSR) for this vCPU.
902     /// Returns the number of MSR entries actually written.
903     ///
904     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
905         self.fd
906             .set_msrs(msrs)
907             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
908     }
909     ///
910     /// Returns the vcpu's current "multiprocessing state".
911     ///
912     fn get_mp_state(&self) -> cpu::Result<MpState> {
913         self.fd
914             .get_mp_state()
915             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
916     }
917     ///
918     /// Sets the vcpu's current "multiprocessing state".
919     ///
920     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
921         self.fd
922             .set_mp_state(mp_state)
923             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
924     }
925     #[cfg(target_arch = "x86_64")]
926     ///
927     /// X86 specific call that returns the vcpu's current "xsave struct".
928     ///
929     fn get_xsave(&self) -> cpu::Result<Xsave> {
930         self.fd
931             .get_xsave()
932             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
933     }
934     #[cfg(target_arch = "x86_64")]
935     ///
936     /// X86 specific call that sets the vcpu's current "xsave struct".
937     ///
938     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
939         self.fd
940             .set_xsave(xsave)
941             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
942     }
943     #[cfg(target_arch = "x86_64")]
944     ///
945     /// X86 specific call that returns the vcpu's current "xcrs".
946     ///
947     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
948         self.fd
949             .get_xcrs()
950             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
951     }
952     #[cfg(target_arch = "x86_64")]
953     ///
954     /// X86 specific call that sets the vcpu's current "xcrs".
955     ///
956     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
957         self.fd
958             .set_xcrs(xcrs)
959             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
960     }
961     ///
962     /// Triggers the running of the current virtual CPU returning an exit reason.
963     ///
964     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
965         match self.fd.run() {
966             Ok(run) => match run {
967                 #[cfg(target_arch = "x86_64")]
968                 VcpuExit::IoIn(addr, data) => {
969                     if let Some(vmmops) = &self.vmmops {
970                         return vmmops
971                             .pio_read(addr.into(), data)
972                             .map(|_| cpu::VmExit::Ignore)
973                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
974                     }
975 
976                     Ok(cpu::VmExit::IoIn(addr, data))
977                 }
978                 #[cfg(target_arch = "x86_64")]
979                 VcpuExit::IoOut(addr, data) => {
980                     if let Some(vmmops) = &self.vmmops {
981                         return vmmops
982                             .pio_write(addr.into(), data)
983                             .map(|_| cpu::VmExit::Ignore)
984                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
985                     }
986 
987                     Ok(cpu::VmExit::IoOut(addr, data))
988                 }
989                 #[cfg(target_arch = "x86_64")]
990                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
991                 #[cfg(target_arch = "x86_64")]
992                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
993 
994                 #[cfg(target_arch = "aarch64")]
995                 VcpuExit::SystemEvent(event_type, flags) => {
996                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
997                     // On Aarch64, when the VM is shutdown, run() returns
998                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
999                     if event_type == KVM_SYSTEM_EVENT_RESET {
1000                         Ok(cpu::VmExit::Reset)
1001                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1002                         Ok(cpu::VmExit::Shutdown)
1003                     } else {
1004                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1005                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1006                             event_type,
1007                             flags
1008                         )))
1009                     }
1010                 }
1011 
1012                 VcpuExit::MmioRead(addr, data) => {
1013                     if let Some(vmmops) = &self.vmmops {
1014                         return vmmops
1015                             .mmio_read(addr, data)
1016                             .map(|_| cpu::VmExit::Ignore)
1017                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1018                     }
1019 
1020                     Ok(cpu::VmExit::MmioRead(addr, data))
1021                 }
1022                 VcpuExit::MmioWrite(addr, data) => {
1023                     if let Some(vmmops) = &self.vmmops {
1024                         return vmmops
1025                             .mmio_write(addr, data)
1026                             .map(|_| cpu::VmExit::Ignore)
1027                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1028                     }
1029 
1030                     Ok(cpu::VmExit::MmioWrite(addr, data))
1031                 }
1032                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1033 
1034                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1035                     "Unexpected exit reason on vcpu run: {:?}",
1036                     r
1037                 ))),
1038             },
1039 
1040             Err(ref e) => match e.errno() {
1041                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1042                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1043                     "VCPU error {:?}",
1044                     e
1045                 ))),
1046             },
1047         }
1048     }
1049     #[cfg(target_arch = "x86_64")]
1050     ///
1051     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
1052     /// states of the vcpu.
1053     ///
1054     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
1055         self.fd
1056             .get_vcpu_events()
1057             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
1058     }
1059     #[cfg(target_arch = "x86_64")]
1060     ///
1061     /// Sets pending exceptions, interrupts, and NMIs as well as related states
1062     /// of the vcpu.
1063     ///
1064     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
1065         self.fd
1066             .set_vcpu_events(events)
1067             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
1068     }
1069     #[cfg(target_arch = "x86_64")]
1070     ///
1071     /// Let the guest know that it has been paused, which prevents from
1072     /// potential soft lockups when being resumed.
1073     ///
1074     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1075         if let Err(e) = self.fd.kvmclock_ctrl() {
1076             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1077             // which could be because we're still in firmware or the guest doesn't
1078             // use KVM clock.
1079             if e.errno() != libc::EINVAL {
1080                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1081             }
1082         }
1083 
1084         Ok(())
1085     }
1086     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1087     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1088         self.fd
1089             .vcpu_init(kvi)
1090             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1091     }
1092     ///
1093     /// Sets the value of one register for this vCPU.
1094     ///
1095     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1096     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1097         self.fd
1098             .set_one_reg(reg_id, data)
1099             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1100     }
1101     ///
1102     /// Gets the value of one register for this vCPU.
1103     ///
1104     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1105     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1106         self.fd
1107             .get_one_reg(reg_id)
1108             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1109     }
1110     ///
1111     /// Gets a list of the guest registers that are supported for the
1112     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1113     ///
1114     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1115     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1116         self.fd
1117             .get_reg_list(reg_list)
1118             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1119     }
1120     ///
1121     /// Save the state of the core registers.
1122     ///
1123     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1124     fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
1125         let mut off = offset__of!(user_pt_regs, regs);
1126         // There are 31 user_pt_regs:
1127         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1128         // These actually are the general-purpose registers of the Armv8-a
1129         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1130         for i in 0..31 {
1131             state.regs.regs[i] = self
1132                 .fd
1133                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1134                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1135             off += std::mem::size_of::<u64>();
1136         }
1137 
1138         // We are now entering the "Other register" section of the ARMv8-a architecture.
1139         // First one, stack pointer.
1140         let off = offset__of!(user_pt_regs, sp);
1141         state.regs.sp = self
1142             .fd
1143             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1144             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1145 
1146         // Second one, the program counter.
1147         let off = offset__of!(user_pt_regs, pc);
1148         state.regs.pc = self
1149             .fd
1150             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1151             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1152 
1153         // Next is the processor state.
1154         let off = offset__of!(user_pt_regs, pstate);
1155         state.regs.pstate = self
1156             .fd
1157             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1158             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1159 
1160         // The stack pointer associated with EL1
1161         let off = offset__of!(kvm_regs, sp_el1);
1162         state.sp_el1 = self
1163             .fd
1164             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1165             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1166 
1167         // Exception Link Register for EL1, when taking an exception to EL1, this register
1168         // holds the address to which to return afterwards.
1169         let off = offset__of!(kvm_regs, elr_el1);
1170         state.elr_el1 = self
1171             .fd
1172             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1173             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1174 
1175         // Saved Program Status Registers, there are 5 of them used in the kernel.
1176         let mut off = offset__of!(kvm_regs, spsr);
1177         for i in 0..KVM_NR_SPSR as usize {
1178             state.spsr[i] = self
1179                 .fd
1180                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1181                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1182             off += std::mem::size_of::<u64>();
1183         }
1184 
1185         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1186         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1187         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1188         for i in 0..32 {
1189             state.fp_regs.vregs[i] = self
1190                 .fd
1191                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1192                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1193                 .into();
1194             off += mem::size_of::<u128>();
1195         }
1196 
1197         // Floating-point Status Register
1198         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1199         state.fp_regs.fpsr = self
1200             .fd
1201             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1202             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1203             as u32;
1204 
1205         // Floating-point Control Register
1206         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1207         state.fp_regs.fpcr = self
1208             .fd
1209             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1210             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1211             as u32;
1212         Ok(())
1213     }
1214     ///
1215     /// Restore the state of the core registers.
1216     ///
1217     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1218     fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
1219         // The function follows the exact identical order from `state`. Look there
1220         // for some additional info on registers.
1221         let mut off = offset__of!(user_pt_regs, regs);
1222         for i in 0..31 {
1223             self.fd
1224                 .set_one_reg(
1225                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1226                     state.regs.regs[i],
1227                 )
1228                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1229             off += std::mem::size_of::<u64>();
1230         }
1231 
1232         let off = offset__of!(user_pt_regs, sp);
1233         self.fd
1234             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1235             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1236 
1237         let off = offset__of!(user_pt_regs, pc);
1238         self.fd
1239             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1240             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1241 
1242         let off = offset__of!(user_pt_regs, pstate);
1243         self.fd
1244             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1245             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1246 
1247         let off = offset__of!(kvm_regs, sp_el1);
1248         self.fd
1249             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1250             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1251 
1252         let off = offset__of!(kvm_regs, elr_el1);
1253         self.fd
1254             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1255             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1256 
1257         let mut off = offset__of!(kvm_regs, spsr);
1258         for i in 0..KVM_NR_SPSR as usize {
1259             self.fd
1260                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1261                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1262             off += std::mem::size_of::<u64>();
1263         }
1264 
1265         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1266         for i in 0..32 {
1267             self.fd
1268                 .set_one_reg(
1269                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1270                     state.fp_regs.vregs[i] as u64,
1271                 )
1272                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1273             off += mem::size_of::<u128>();
1274         }
1275 
1276         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1277         self.fd
1278             .set_one_reg(
1279                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1280                 state.fp_regs.fpsr as u64,
1281             )
1282             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1283 
1284         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1285         self.fd
1286             .set_one_reg(
1287                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1288                 state.fp_regs.fpcr as u64,
1289             )
1290             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1291         Ok(())
1292     }
1293     ///
1294     /// Save the state of the system registers.
1295     ///
1296     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1297     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1298         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1299         // around 500 registers.
1300         let mut reg_list = RegList::new(500).unwrap();
1301         self.fd
1302             .get_reg_list(&mut reg_list)
1303             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1304 
1305         // At this point reg_list should contain: core registers and system registers.
1306         // The register list contains the number of registers and their ids. We will be needing to
1307         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1308         // the core registers which are represented in the kernel by kvm_regs structure and for which
1309         // we can calculate the id based on the offset in the structure.
1310         reg_list.retain(|regid| is_system_register(*regid));
1311 
1312         // Now, for the rest of the registers left in the previously fetched register list, we are
1313         // simply calling KVM_GET_ONE_REG.
1314         let indices = reg_list.as_slice();
1315         for index in indices.iter() {
1316             state.push(kvm_bindings::kvm_one_reg {
1317                 id: *index,
1318                 addr: self
1319                     .fd
1320                     .get_one_reg(*index)
1321                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1322             });
1323         }
1324 
1325         Ok(())
1326     }
1327     ///
1328     /// Restore the state of the system registers.
1329     ///
1330     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1331     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1332         for reg in state {
1333             self.fd
1334                 .set_one_reg(reg.id, reg.addr)
1335                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1336         }
1337         Ok(())
1338     }
1339     ///
1340     /// Read the MPIDR - Multiprocessor Affinity Register.
1341     ///
1342     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1343     fn read_mpidr(&self) -> cpu::Result<u64> {
1344         self.fd
1345             .get_one_reg(MPIDR_EL1)
1346             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1347     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            // Hyper-V SynIC/SynTimer MSR indices (HV_X64_MSR_* range).
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stops at the first unreadable MSR and returns how many
            // it processed; that count is therefore the index of the faulty
            // entry. Skip it and retry with the remainder of the list,
            // repeating until a chunk completes fully.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                // Accumulate the entries that were successfully read.
                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1479     ///
1480     /// Get the current AArch64 CPU state
1481     ///
1482     #[cfg(target_arch = "aarch64")]
1483     fn state(&self) -> cpu::Result<CpuState> {
1484         let mut state = CpuState {
1485             mp_state: self.get_mp_state()?,
1486             mpidr: self.read_mpidr()?,
1487             ..Default::default()
1488         };
1489         self.core_registers(&mut state.core_regs)?;
1490         self.system_registers(&mut state.sys_regs)?;
1491 
1492         Ok(state)
1493     }
1494     #[cfg(target_arch = "x86_64")]
1495     ///
1496     /// Restore the previously saved CPU state
1497     ///
1498     /// Ordering requirements:
1499     ///
1500     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1501     /// still running.
1502     ///
1503     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1504     /// if we ever change the BSP, we have to do that before restoring anything.
1505     /// The same seems to be true for CPUID stuff.
1506     ///
1507     /// SREGS saves/restores a pending interrupt, similar to what
1508     /// VCPU_EVENTS also does.
1509     ///
1510     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1511     /// done before SET_VCPU_EVENTS, which restores it.
1512     ///
1513     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1514     /// the apic base msr.
1515     ///
1516     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1517     /// only restores successfully, when the LAPIC is correctly configured.
1518     ///
1519     /// Arguments: CpuState
1520     /// # Example
1521     ///
1522     /// ```rust
1523     /// # extern crate hypervisor;
1524     /// # use hypervisor::KvmHypervisor;
1525     /// # use std::sync::Arc;
1526     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
1527     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
1528     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1529     /// vm.enable_split_irq().unwrap();
1530     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1531     /// let state = vcpu.state().unwrap();
1532     /// vcpu.set_state(&state).unwrap();
1533     /// ```
1534     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1535         self.set_cpuid2(&state.cpuid)?;
1536         self.set_mp_state(state.mp_state)?;
1537         self.set_regs(&state.regs)?;
1538         self.set_sregs(&state.sregs)?;
1539         self.set_xsave(&state.xsave)?;
1540         self.set_xcrs(&state.xcrs)?;
1541         self.set_lapic(&state.lapic_state)?;
1542         self.set_fpu(&state.fpu)?;
1543 
1544         // Try to set all MSRs previously stored.
1545         // If the number of MSRs set from SET_MSRS is different from the
1546         // expected amount, we fallback onto a slower method by setting MSRs
1547         // by chunks. This is the only way to make sure we try to set as many
1548         // MSRs as possible, even if some MSRs are not supported.
1549         let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
1550         let num_msrs = self.set_msrs(&state.msrs)?;
1551         if num_msrs != expected_num_msrs {
1552             let mut faulty_msr_index = num_msrs;
1553 
1554             loop {
1555                 warn!(
1556                     "Detected faulty MSR 0x{:x} while setting MSRs",
1557                     state.msrs.as_slice()[faulty_msr_index].index
1558                 );
1559 
1560                 let start_pos = faulty_msr_index + 1;
1561                 let sub_msr_entries =
1562                     MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
1563                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
1564                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
1565 
1566                 if num_msrs == expected_num_msrs {
1567                     break;
1568                 }
1569 
1570                 faulty_msr_index = start_pos + num_msrs;
1571             }
1572         }
1573 
1574         self.set_vcpu_events(&state.vcpu_events)?;
1575 
1576         Ok(())
1577     }
1578     ///
1579     /// Restore the previously saved AArch64 CPU state
1580     ///
1581     #[cfg(target_arch = "aarch64")]
1582     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1583         self.set_core_registers(&state.core_regs)?;
1584         self.set_system_registers(&state.sys_regs)?;
1585         self.set_mp_state(state.mp_state)?;
1586 
1587         Ok(())
1588     }
1589 
1590     ///
1591     /// Initialize TDX for this CPU
1592     ///
1593     #[cfg(feature = "tdx")]
1594     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1595         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1596             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1597     }
1598 }
1599 
/// Device struct for KVM
pub struct KvmDevice {
    // File descriptor of the in-kernel emulated device returned by KVM.
    fd: DeviceFd,
}
1604 
1605 impl device::Device for KvmDevice {
1606     ///
1607     /// Set device attribute
1608     ///
1609     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1610         self.fd
1611             .set_device_attr(attr)
1612             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1613     }
1614     ///
1615     /// Get device attribute
1616     ///
1617     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1618         self.fd
1619             .get_device_attr(attr)
1620             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1621     }
1622 }
1623 
1624 impl AsRawFd for KvmDevice {
1625     fn as_raw_fd(&self) -> RawFd {
1626         self.fd.as_raw_fd()
1627     }
1628 }
1629