xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision f7f2f25a574b1b2dba22c094fc8226d404157d15)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 use std::collections::HashMap;
26 #[cfg(target_arch = "aarch64")]
27 use std::convert::TryInto;
28 #[cfg(target_arch = "x86_64")]
29 use std::fs::File;
30 use std::os::unix::io::{AsRawFd, RawFd};
31 use std::result;
32 #[cfg(target_arch = "x86_64")]
33 use std::sync::atomic::{AtomicBool, Ordering};
34 use std::sync::{Arc, RwLock};
35 #[cfg(target_arch = "x86_64")]
36 use vm_memory::Address;
37 use vmm_sys_util::eventfd::EventFd;
38 // x86_64 dependencies
39 #[cfg(target_arch = "x86_64")]
40 pub mod x86_64;
41 #[cfg(target_arch = "x86_64")]
42 use crate::arch::x86::NUM_IOAPIC_PINS;
43 #[cfg(target_arch = "aarch64")]
44 use aarch64::{RegList, Register, StandardRegisters};
45 #[cfg(target_arch = "x86_64")]
46 use kvm_bindings::{
47     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
48 };
49 #[cfg(target_arch = "x86_64")]
50 use x86_64::{
51     check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
52 };
53 #[cfg(target_arch = "x86_64")]
54 pub use x86_64::{
55     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
56     Xsave, CPUID_FLAG_VALID_INDEX,
57 };
58 // aarch64 dependencies
59 #[cfg(target_arch = "aarch64")]
60 pub mod aarch64;
61 pub use kvm_bindings;
62 #[cfg(feature = "tdx")]
63 use kvm_bindings::KVMIO;
64 pub use kvm_bindings::{
65     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
66     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
67     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
68 };
69 #[cfg(target_arch = "aarch64")]
70 use kvm_bindings::{
71     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
72     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
73 };
74 pub use kvm_ioctls;
75 pub use kvm_ioctls::{Cap, Kvm};
76 #[cfg(target_arch = "aarch64")]
77 use std::mem;
78 use thiserror::Error;
79 #[cfg(feature = "tdx")]
80 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
81 ///
82 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
83 ///
84 pub use {
85     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
86     kvm_bindings::kvm_device_attr as DeviceAttr,
87     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
88     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
89     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
90     kvm_ioctls::VcpuExit,
91 };
92 
#[cfg(target_arch = "x86_64")]
// Capability number for KVM_CAP_SGX_ATTRIBUTE; hard-coded here because it is
// not exported by the kvm-bindings version in use — presumably mirrors the
// Linux UAPI value, confirm against the kernel headers when updating.
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
// _IOWR(KVMIO, 0xba, unsigned long): the ioctl used to drive TDX operations
// on VM and vCPU file descriptors (see `tdx_command`).
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
98 
#[cfg(feature = "tdx")]
#[repr(u32)]
/// Sub-commands accepted by the `KVM_MEMORY_ENCRYPT_OP` ioctl for a TDX
/// guest. The discriminant values are the wire values passed to the kernel.
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
109 
/// KVM-specific VM state used for save/restore through `Vm::state` /
/// `Vm::set_state`. Currently empty: this KVM backend keeps no VM-scoped
/// state that needs snapshotting here.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

// Generic alias so hypervisor-agnostic code can refer to `VmState`.
pub use KvmVmState as VmState;
114 
/// Bookkeeping entry for a memory slot that requested dirty-page logging.
/// The region is created without `KVM_MEM_LOG_DIRTY_PAGES`; these saved
/// parameters let `start_dirty_log`/`stop_dirty_log` re-issue
/// `KVM_SET_USER_MEMORY_REGION` with the flag toggled on or off.
struct KvmDirtyLogSlot {
    // KVM memory slot index.
    slot: u32,
    // Guest physical address of the region.
    guest_phys_addr: u64,
    // Size of the region in bytes.
    memory_size: u64,
    // Host virtual address backing the region.
    userspace_addr: u64,
}
121 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the VM file descriptor.
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    // Template list of host-supported MSR indices, filled at VM creation and
    // cloned into every vCPU created from this VM.
    msrs: MsrEntries,
    // VM-scoped state returned by `state()` (currently empty).
    state: KvmVmState,
    // Slots that asked for dirty-page logging, keyed by slot index; consumed
    // by `start_dirty_log`/`stop_dirty_log`.
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
130 
131 ///
132 /// Implementation of Vm trait for KVM
133 /// Example:
134 /// #[cfg(feature = "kvm")]
135 /// extern crate hypervisor
136 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
137 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
138 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
139 /// vm.set/get().unwrap()
140 ///
141 impl vm::Vm for KvmVm {
142     #[cfg(target_arch = "x86_64")]
143     ///
144     /// Sets the address of the three-page region in the VM's address space.
145     ///
146     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
147         self.fd
148             .set_tss_address(offset)
149             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
150     }
151     ///
152     /// Creates an in-kernel interrupt controller.
153     ///
154     fn create_irq_chip(&self) -> vm::Result<()> {
155         self.fd
156             .create_irq_chip()
157             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
158     }
159     ///
160     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
161     ///
162     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
163         self.fd
164             .register_irqfd(fd, gsi)
165             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
166     }
167     ///
168     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
169     ///
170     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
171         self.fd
172             .unregister_irqfd(fd, gsi)
173             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
174     }
175     ///
176     /// Creates a VcpuFd object from a vcpu RawFd.
177     ///
178     fn create_vcpu(
179         &self,
180         id: u8,
181         vmmops: Option<Arc<dyn VmmOps>>,
182     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
183         let vc = self
184             .fd
185             .create_vcpu(id as u64)
186             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
187         let vcpu = KvmVcpu {
188             fd: vc,
189             #[cfg(target_arch = "x86_64")]
190             msrs: self.msrs.clone(),
191             vmmops,
192             #[cfg(target_arch = "x86_64")]
193             hyperv_synic: AtomicBool::new(false),
194         };
195         Ok(Arc::new(vcpu))
196     }
197     ///
198     /// Registers an event to be signaled whenever a certain address is written to.
199     ///
200     fn register_ioevent(
201         &self,
202         fd: &EventFd,
203         addr: &IoEventAddress,
204         datamatch: Option<vm::DataMatch>,
205     ) -> vm::Result<()> {
206         if let Some(dm) = datamatch {
207             match dm {
208                 vm::DataMatch::DataMatch32(kvm_dm32) => self
209                     .fd
210                     .register_ioevent(fd, addr, kvm_dm32)
211                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
212                 vm::DataMatch::DataMatch64(kvm_dm64) => self
213                     .fd
214                     .register_ioevent(fd, addr, kvm_dm64)
215                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
216             }
217         } else {
218             self.fd
219                 .register_ioevent(fd, addr, NoDatamatch)
220                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
221         }
222     }
223     ///
224     /// Unregisters an event from a certain address it has been previously registered to.
225     ///
226     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
227         self.fd
228             .unregister_ioevent(fd, addr, NoDatamatch)
229             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
230     }
231     ///
232     /// Sets the GSI routing table entries, overwriting any previously set
233     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
234     ///
235     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
236         let mut irq_routing =
237             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
238         irq_routing[0].nr = entries.len() as u32;
239         irq_routing[0].flags = 0;
240 
241         unsafe {
242             let entries_slice: &mut [kvm_irq_routing_entry] =
243                 irq_routing[0].entries.as_mut_slice(entries.len());
244             entries_slice.copy_from_slice(entries);
245         }
246 
247         self.fd
248             .set_gsi_routing(&irq_routing[0])
249             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
250     }
251     ///
252     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
253     ///
254     fn make_user_memory_region(
255         &self,
256         slot: u32,
257         guest_phys_addr: u64,
258         memory_size: u64,
259         userspace_addr: u64,
260         readonly: bool,
261         log_dirty_pages: bool,
262     ) -> MemoryRegion {
263         MemoryRegion {
264             slot,
265             guest_phys_addr,
266             memory_size,
267             userspace_addr,
268             flags: if readonly { KVM_MEM_READONLY } else { 0 }
269                 | if log_dirty_pages {
270                     KVM_MEM_LOG_DIRTY_PAGES
271                 } else {
272                     0
273                 },
274         }
275     }
276     ///
277     /// Creates a guest physical memory region.
278     ///
279     fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
280         let mut region = user_memory_region;
281 
282         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
283             if (region.flags & KVM_MEM_READONLY) != 0 {
284                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
285                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
286                 )));
287             }
288 
289             // Keep track of the regions that need dirty pages log
290             self.dirty_log_slots.write().unwrap().insert(
291                 region.slot,
292                 KvmDirtyLogSlot {
293                     slot: region.slot,
294                     guest_phys_addr: region.guest_phys_addr,
295                     memory_size: region.memory_size,
296                     userspace_addr: region.userspace_addr,
297                 },
298             );
299 
300             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
301             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
302             region.flags = 0;
303         }
304 
305         // Safe because guest regions are guaranteed not to overlap.
306         unsafe {
307             self.fd
308                 .set_user_memory_region(region)
309                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
310         }
311     }
312     ///
313     /// Removes a guest physical memory region.
314     ///
315     fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
316         let mut region = user_memory_region;
317 
318         // Remove the corresponding entry from "self.dirty_log_slots" if needed
319         self.dirty_log_slots.write().unwrap().remove(&region.slot);
320 
321         // Setting the size to 0 means "remove"
322         region.memory_size = 0;
323         // Safe because guest regions are guaranteed not to overlap.
324         unsafe {
325             self.fd
326                 .set_user_memory_region(region)
327                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
328         }
329     }
330     ///
331     /// Creates an emulated device in the kernel.
332     ///
333     /// See the documentation for `KVM_CREATE_DEVICE`.
334     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
335         let fd = self
336             .fd
337             .create_device(device)
338             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
339         let device = KvmDevice { fd };
340         Ok(Arc::new(device))
341     }
342     ///
343     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
344     ///
345     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
346     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
347         self.fd
348             .get_preferred_target(kvi)
349             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
350     }
351     #[cfg(target_arch = "x86_64")]
352     fn enable_split_irq(&self) -> vm::Result<()> {
353         // Set TSS
354         self.fd
355             .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
356             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
357         // Create split irqchip
358         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
359         // are not.
360         let mut cap = kvm_enable_cap {
361             cap: KVM_CAP_SPLIT_IRQCHIP,
362             ..Default::default()
363         };
364         cap.args[0] = NUM_IOAPIC_PINS as u64;
365         self.fd
366             .enable_cap(&cap)
367             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
368         Ok(())
369     }
370     #[cfg(target_arch = "x86_64")]
371     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
372         let mut cap = kvm_enable_cap {
373             cap: KVM_CAP_SGX_ATTRIBUTE,
374             ..Default::default()
375         };
376         cap.args[0] = file.as_raw_fd() as u64;
377         self.fd
378             .enable_cap(&cap)
379             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
380         Ok(())
381     }
382     /// Retrieve guest clock.
383     #[cfg(target_arch = "x86_64")]
384     fn get_clock(&self) -> vm::Result<ClockData> {
385         self.fd
386             .get_clock()
387             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
388     }
389     /// Set guest clock.
390     #[cfg(target_arch = "x86_64")]
391     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
392         self.fd
393             .set_clock(data)
394             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
395     }
396     /// Checks if a particular `Cap` is available.
397     fn check_extension(&self, c: Cap) -> bool {
398         self.fd.check_extension(c)
399     }
400     /// Create a device that is used for passthrough
401     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
402         let mut vfio_dev = kvm_create_device {
403             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
404             fd: 0,
405             flags: 0,
406         };
407 
408         self.create_device(&mut vfio_dev)
409             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
410     }
411     ///
412     /// Get the Vm state. Return VM specific data
413     ///
414     fn state(&self) -> vm::Result<VmState> {
415         Ok(self.state)
416     }
417     ///
418     /// Set the VM state
419     ///
420     fn set_state(&self, _state: VmState) -> vm::Result<()> {
421         Ok(())
422     }
423 
424     ///
425     /// Start logging dirty pages
426     ///
427     fn start_dirty_log(&self) -> vm::Result<()> {
428         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
429         for (_, s) in dirty_log_slots.iter() {
430             let region = MemoryRegion {
431                 slot: s.slot,
432                 guest_phys_addr: s.guest_phys_addr,
433                 memory_size: s.memory_size,
434                 userspace_addr: s.userspace_addr,
435                 flags: KVM_MEM_LOG_DIRTY_PAGES,
436             };
437             // Safe because guest regions are guaranteed not to overlap.
438             unsafe {
439                 self.fd
440                     .set_user_memory_region(region)
441                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
442             }
443         }
444 
445         Ok(())
446     }
447 
448     ///
449     /// Stop logging dirty pages
450     ///
451     fn stop_dirty_log(&self) -> vm::Result<()> {
452         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
453         for (_, s) in dirty_log_slots.iter() {
454             let region = MemoryRegion {
455                 slot: s.slot,
456                 guest_phys_addr: s.guest_phys_addr,
457                 memory_size: s.memory_size,
458                 userspace_addr: s.userspace_addr,
459                 flags: 0,
460             };
461             // Safe because guest regions are guaranteed not to overlap.
462             unsafe {
463                 self.fd
464                     .set_user_memory_region(region)
465                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
466             }
467         }
468 
469         Ok(())
470     }
471 
472     ///
473     /// Get dirty pages bitmap (one bit per page)
474     ///
475     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
476         self.fd
477             .get_dirty_log(slot, memory_size as usize)
478             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
479     }
480 
481     ///
482     /// Initialize TDX for this VM
483     ///
484     #[cfg(feature = "tdx")]
485     fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
486         #[repr(C)]
487         struct TdxInitVm {
488             max_vcpus: u32,
489             reserved: u32,
490             attributes: u64,
491             cpuid: u64,
492         }
493         let data = TdxInitVm {
494             max_vcpus,
495             reserved: 0,
496             attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
497             cpuid: cpuid.as_fam_struct_ptr() as u64,
498         };
499 
500         tdx_command(
501             &self.fd.as_raw_fd(),
502             TdxCommand::InitVm,
503             0,
504             &data as *const _ as u64,
505         )
506         .map_err(vm::HypervisorVmError::InitializeTdx)
507     }
508 
509     ///
510     /// Finalize the TDX setup for this VM
511     ///
512     #[cfg(feature = "tdx")]
513     fn tdx_finalize(&self) -> vm::Result<()> {
514         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
515             .map_err(vm::HypervisorVmError::FinalizeTdx)
516     }
517 
518     ///
519     /// Initialize memory regions for the TDX VM
520     ///
521     #[cfg(feature = "tdx")]
522     fn tdx_init_memory_region(
523         &self,
524         host_address: u64,
525         guest_address: u64,
526         size: u64,
527         measure: bool,
528     ) -> vm::Result<()> {
529         #[repr(C)]
530         struct TdxInitMemRegion {
531             host_address: u64,
532             guest_address: u64,
533             pages: u64,
534         }
535         let data = TdxInitMemRegion {
536             host_address,
537             guest_address,
538             pages: size / 4096,
539         };
540 
541         tdx_command(
542             &self.fd.as_raw_fd(),
543             TdxCommand::InitMemRegion,
544             if measure { 1 } else { 0 },
545             &data as *const _ as u64,
546         )
547         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
548     }
549 }
550 
#[cfg(feature = "tdx")]
/// Issues a TDX sub-command to KVM via the `KVM_MEMORY_ENCRYPT_OP` ioctl.
///
/// * `fd` - raw VM or vCPU file descriptor the command applies to.
/// * `command` - TDX sub-command to execute.
/// * `metadata` - command-specific flags (e.g. the "measure" flag for
///   `InitMemRegion`).
/// * `data` - userspace address of the command payload, cast to `u64`
///   (0 when the command takes no payload).
///
/// Returns the OS error reported by the ioctl on failure.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Layout expected by the kernel for KVM_MEMORY_ENCRYPT_OP.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: `cmd` is a valid, properly laid-out (#[repr(C)]) structure that
    // outlives the ioctl call, and `fd` is expected to be a live KVM fd.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
582 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to /dev/kvm used for system-level (non-VM) ioctls.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    // A KVM capability required by this VMM is not exposed by the host.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
// Convenience alias for results produced by this module's KVM checks.
pub type KvmResult<T> = result::Result<T, KvmError>;
594 impl KvmHypervisor {
595     /// Create a hypervisor based on Kvm
596     pub fn new() -> hypervisor::Result<KvmHypervisor> {
597         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
598         let api_version = kvm_obj.get_api_version();
599 
600         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
601             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
602         }
603 
604         Ok(KvmHypervisor { kvm: kvm_obj })
605     }
606 }
607 /// Implementation of Hypervisor trait for KVM
608 /// Example:
609 /// #[cfg(feature = "kvm")]
610 /// extern crate hypervisor
611 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
612 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
613 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
614 ///
615 impl hypervisor::Hypervisor for KvmHypervisor {
616     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
617     /// Example
618     /// # extern crate hypervisor;
619     /// # use hypervisor::KvmHypervisor;
620     /// use hypervisor::KvmVm;
621     /// let hypervisor = KvmHypervisor::new().unwrap();
622     /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
623     ///
624     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
625         let fd: VmFd;
626         loop {
627             match self.kvm.create_vm_with_type(vm_type) {
628                 Ok(res) => fd = res,
629                 Err(e) => {
630                     if e.errno() == libc::EINTR {
631                         // If the error returned is EINTR, which means the
632                         // ioctl has been interrupted, we have to retry as
633                         // this can't be considered as a regular error.
634                         continue;
635                     } else {
636                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
637                     }
638                 }
639             }
640             break;
641         }
642 
643         let vm_fd = Arc::new(fd);
644 
645         #[cfg(target_arch = "x86_64")]
646         {
647             let msr_list = self.get_msr_list()?;
648             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
649             let mut msrs = MsrEntries::new(num_msrs).unwrap();
650             let indices = msr_list.as_slice();
651             let msr_entries = msrs.as_mut_slice();
652             for (pos, index) in indices.iter().enumerate() {
653                 msr_entries[pos].index = *index;
654             }
655 
656             Ok(Arc::new(KvmVm {
657                 fd: vm_fd,
658                 msrs,
659                 state: VmState {},
660                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
661             }))
662         }
663 
664         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
665         {
666             Ok(Arc::new(KvmVm {
667                 fd: vm_fd,
668                 state: VmState {},
669                 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
670             }))
671         }
672     }
673 
674     /// Create a KVM vm object and return the object as Vm trait object
675     /// Example
676     /// # extern crate hypervisor;
677     /// # use hypervisor::KvmHypervisor;
678     /// use hypervisor::KvmVm;
679     /// let hypervisor = KvmHypervisor::new().unwrap();
680     /// let vm = hypervisor.create_vm().unwrap()
681     ///
682     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
683         #[allow(unused_mut)]
684         let mut vm_type: u64 = 0; // Create with default platform type
685 
686         // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
687         // size from the host and use that when creating the VM, which may
688         // avoid unnecessary VM creation failures.
689         #[cfg(target_arch = "aarch64")]
690         if self.kvm.check_extension(Cap::ArmVmIPASize) {
691             vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
692         }
693 
694         self.create_vm_with_type(vm_type)
695     }
696 
697     fn check_required_extensions(&self) -> hypervisor::Result<()> {
698         check_required_kvm_extensions(&self.kvm)
699             .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
700     }
701 
702     #[cfg(target_arch = "x86_64")]
703     ///
704     /// X86 specific call to get the system supported CPUID values.
705     ///
706     fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
707         self.kvm
708             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
709             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
710     }
711 
712     #[cfg(target_arch = "x86_64")]
713     ///
714     /// Retrieve the list of MSRs supported by KVM.
715     ///
716     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
717         self.kvm
718             .get_msr_index_list()
719             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
720     }
721     #[cfg(target_arch = "aarch64")]
722     ///
723     /// Retrieve AArch64 host maximum IPA size supported by KVM.
724     ///
725     fn get_host_ipa_limit(&self) -> i32 {
726         self.kvm.get_host_ipa_limit()
727     }
728 }
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Per-vCPU KVM file descriptor.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR index list cloned from the owning `KvmVm` at vCPU creation; used
    // as the template when reading/writing this vCPU's MSRs.
    msrs: MsrEntries,
    // Optional VMM callbacks — presumably used to emulate MMIO/PIO exits;
    // the call sites are outside this chunk, confirm against the rest of
    // the file.
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    // Set once `enable_hyperv_synic` succeeds; influences which MSRs are
    // saved later.
    hyperv_synic: AtomicBool,
}
738 /// Implementation of Vcpu trait for KVM
739 /// Example:
740 /// #[cfg(feature = "kvm")]
741 /// extern crate hypervisor
742 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
743 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
744 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
745 /// let vcpu = vm.create_vcpu(0, None).unwrap();
746 /// vcpu.get/set().unwrap()
747 ///
748 impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    /// Thin wrapper around the `KVM_GET_REGS` ioctl.
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        self.fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        self.fd
            .set_regs(regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    /// Thin wrapper around the `KVM_GET_SREGS` ioctl.
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        self.fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        self.fd
            .set_sregs(sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    /// Thin wrapper around the `KVM_GET_FPU` ioctl.
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        self.fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        self.fd
            .set_fpu(fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    /// Thin wrapper around the `KVM_SET_CPUID2` ioctl.
    fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
        self.fd
            .set_cpuid2(cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    /// Enables the `KVM_CAP_HYPERV_SYNIC` capability on this vCPU and
    /// records that SynIC is active.
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    /// `num_entries` caps the number of entries KVM may return.
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
        self.fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    /// Thin wrapper around the `KVM_GET_LAPIC` ioctl.
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        self.fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    /// Thin wrapper around the `KVM_SET_LAPIC` ioctl.
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        self.fd
            .set_lapic(klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    /// `msrs` carries the indices to read; values are filled in place.
    /// Returns the number of MSR entries actually read.
    fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
        self.fd
            .get_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
        self.fd
            .set_msrs(msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
875     ///
876     /// Returns the vcpu's current "multiprocessing state".
877     ///
878     fn get_mp_state(&self) -> cpu::Result<MpState> {
879         self.fd
880             .get_mp_state()
881             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
882     }
883     ///
884     /// Sets the vcpu's current "multiprocessing state".
885     ///
886     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
887         self.fd
888             .set_mp_state(mp_state)
889             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
890     }
891     #[cfg(target_arch = "x86_64")]
892     ///
893     /// X86 specific call that returns the vcpu's current "xsave struct".
894     ///
895     fn get_xsave(&self) -> cpu::Result<Xsave> {
896         self.fd
897             .get_xsave()
898             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
899     }
900     #[cfg(target_arch = "x86_64")]
901     ///
902     /// X86 specific call that sets the vcpu's current "xsave struct".
903     ///
904     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
905         self.fd
906             .set_xsave(xsave)
907             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
908     }
909     #[cfg(target_arch = "x86_64")]
910     ///
911     /// X86 specific call that returns the vcpu's current "xcrs".
912     ///
913     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
914         self.fd
915             .get_xcrs()
916             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
917     }
918     #[cfg(target_arch = "x86_64")]
919     ///
920     /// X86 specific call that sets the vcpu's current "xcrs".
921     ///
922     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
923         self.fd
924             .set_xcrs(xcrs)
925             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
926     }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    /// Exits that can be completed internally — PIO/MMIO serviced by the
    /// registered `vmmops` callbacks, or an interrupted ioctl — are reported
    /// as `cpu::VmExit::Ignore` so the caller simply re-enters the vCPU.
    /// Recognized exits are translated to their `cpu::VmExit` counterpart;
    /// anything unexpected becomes a `HypervisorCpuError::RunVcpu` error.
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    // If the VMM registered I/O callbacks, let it complete
                    // the port read in place and resume the vCPU.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    // Same as IoIn above, but for port writes.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On Aarch64, when the VM is shutdown, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    // MMIO reads can also be completed by the VMM callbacks.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    // MMIO writes can also be completed by the VMM callbacks.
                    if let Some(vmmops) = &self.vmmops {
                        return vmmops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                // EAGAIN/EINTR mean the run ioctl was interrupted (e.g. by a
                // signal); not an error, the caller should re-enter the vCPU.
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
1015     #[cfg(target_arch = "x86_64")]
1016     ///
1017     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
1018     /// states of the vcpu.
1019     ///
1020     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
1021         self.fd
1022             .get_vcpu_events()
1023             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
1024     }
1025     #[cfg(target_arch = "x86_64")]
1026     ///
1027     /// Sets pending exceptions, interrupts, and NMIs as well as related states
1028     /// of the vcpu.
1029     ///
1030     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
1031         self.fd
1032             .set_vcpu_events(events)
1033             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
1034     }
1035     #[cfg(target_arch = "x86_64")]
1036     ///
1037     /// Let the guest know that it has been paused, which prevents from
1038     /// potential soft lockups when being resumed.
1039     ///
1040     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1041         self.fd
1042             .kvmclock_ctrl()
1043             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
1044     }
1045     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1046     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1047         self.fd
1048             .vcpu_init(kvi)
1049             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1050     }
1051     ///
1052     /// Sets the value of one register for this vCPU.
1053     ///
1054     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1055     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1056         self.fd
1057             .set_one_reg(reg_id, data)
1058             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1059     }
1060     ///
1061     /// Gets the value of one register for this vCPU.
1062     ///
1063     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1064     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1065         self.fd
1066             .get_one_reg(reg_id)
1067             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1068     }
1069     ///
1070     /// Gets a list of the guest registers that are supported for the
1071     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1072     ///
1073     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1074     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1075         self.fd
1076             .get_reg_list(reg_list)
1077             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1078     }
    ///
    /// Save the state of the core registers.
    ///
    /// Reads the AArch64 core register file — general-purpose registers,
    /// sp/pc/pstate, the EL1 banked registers, the SPSRs and the FP/SIMD
    /// state — one register at a time through KVM_GET_ONE_REG, deriving each
    /// register id from the field's byte offset inside the kernel's
    /// `kvm_regs` structure.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1, when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        // NOTE(review): the id requests a 128-bit read (KVM_REG_SIZE_U128) but
        // only the low 64 bits land in vregs[i][0]; the upper half of each
        // vector register does not appear to be captured — confirm intended.
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i][0] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(())
    }
    ///
    /// Restore the state of the core registers.
    ///
    /// Mirror of `core_registers()`: writes each saved value back with
    /// KVM_SET_ONE_REG, visiting the fields in the same order and deriving
    /// each register id from its offset inside the kernel's `kvm_regs`.
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact identical order from `state`. Look there
        // for some additional info on registers.
        let mut off = offset__of!(user_pt_regs, regs);
        // x0-x30: the 31 general-purpose registers.
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Program counter.
        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Processor state.
        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // EL1 stack pointer.
        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // EL1 exception link register.
        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Saved Program Status Registers.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // FP/SIMD vector registers.
        // NOTE(review): only the low 64 bits of each 128-bit vector register
        // (vregs[i][0]) are written back, matching what `core_registers()`
        // saved — confirm whether the upper half should be restored too.
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i][0],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Floating-point Control Register.
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
1251     ///
1252     /// Save the state of the system registers.
1253     ///
1254     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1255     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1256         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1257         // around 500 registers.
1258         let mut reg_list = RegList::new(500).unwrap();
1259         self.fd
1260             .get_reg_list(&mut reg_list)
1261             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1262 
1263         // At this point reg_list should contain: core registers and system registers.
1264         // The register list contains the number of registers and their ids. We will be needing to
1265         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1266         // the core registers which are represented in the kernel by kvm_regs structure and for which
1267         // we can calculate the id based on the offset in the structure.
1268 
1269         reg_list.retain(|regid| *regid != 0);
1270         reg_list.as_slice().to_vec().sort_unstable();
1271 
1272         reg_list.retain(|regid| is_system_register(*regid));
1273 
1274         // Now, for the rest of the registers left in the previously fetched register list, we are
1275         // simply calling KVM_GET_ONE_REG.
1276         let indices = reg_list.as_slice();
1277         for (_pos, index) in indices.iter().enumerate() {
1278             if _pos > 230 {
1279                 break;
1280             }
1281             state.push(kvm_bindings::kvm_one_reg {
1282                 id: *index,
1283                 addr: self
1284                     .fd
1285                     .get_one_reg(*index)
1286                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1287             });
1288         }
1289 
1290         Ok(())
1291     }
1292     ///
1293     /// Restore the state of the system registers.
1294     ///
1295     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1296     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1297         for reg in state {
1298             self.fd
1299                 .set_one_reg(reg.id, reg.addr)
1300                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1301         }
1302         Ok(())
1303     }
1304     ///
1305     /// Read the MPIDR - Multiprocessor Affinity Register.
1306     ///
1307     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1308     fn read_mpidr(&self) -> cpu::Result<u64> {
1309         self.fd
1310             .get_one_reg(MPIDR_EL1)
1311             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1312     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // Honor the ordering constraints documented above: MP state early,
        // VCPU events last.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS returned fewer entries than requested, so the entry at
            // index `num_msrs` is the one it choked on. Keep the good prefix,
            // then skip each faulty MSR and retry with the remainder.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Retry with everything after the faulty entry.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                // `num_msrs` is relative to `start_pos` on this iteration.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1444     ///
1445     /// Get the current AArch64 CPU state
1446     ///
1447     #[cfg(target_arch = "aarch64")]
1448     fn state(&self) -> cpu::Result<CpuState> {
1449         let mut state = CpuState {
1450             mp_state: self.get_mp_state()?,
1451             mpidr: self.read_mpidr()?,
1452             ..Default::default()
1453         };
1454         self.core_registers(&mut state.core_regs)?;
1455         self.system_registers(&mut state.sys_regs)?;
1456 
1457         Ok(state)
1458     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // The sequence below follows the ordering constraints documented
        // above (regs before vcpu_events, lapic after sregs and before msrs).
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS wrote fewer entries than requested, so the entry at
            // index `num_msrs` is the faulty one. Skip it and retry with the
            // remainder until everything settable has been written.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                // `num_msrs` is relative to `start_pos` on this iteration.
                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1543     ///
1544     /// Restore the previously saved AArch64 CPU state
1545     ///
1546     #[cfg(target_arch = "aarch64")]
1547     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1548         self.set_core_registers(&state.core_regs)?;
1549         self.set_system_registers(&state.sys_regs)?;
1550         self.set_mp_state(state.mp_state)?;
1551 
1552         Ok(())
1553     }
1554 
1555     ///
1556     /// Initialize TDX for this CPU
1557     ///
1558     #[cfg(feature = "tdx")]
1559     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1560         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1561             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1562     }
1563 }
1564 
/// Device struct for KVM
///
/// Wraps the device file descriptor through which device attributes are
/// read and written.
pub struct KvmDevice {
    // Underlying device fd; all attribute ioctls go through this handle.
    fd: DeviceFd,
}
1569 
1570 impl device::Device for KvmDevice {
1571     ///
1572     /// Set device attribute
1573     ///
1574     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1575         self.fd
1576             .set_device_attr(attr)
1577             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1578     }
1579     ///
1580     /// Get device attribute
1581     ///
1582     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1583         self.fd
1584             .get_device_attr(attr)
1585             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1586     }
1587 }
1588 
impl AsRawFd for KvmDevice {
    // Expose the underlying device file descriptor so callers can hand the
    // device to APIs that expect a RawFd.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
1594