xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision f67b3f79ea19c9a66e04074cbbf5d292f6529e43)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 use std::collections::HashMap;
26 #[cfg(target_arch = "aarch64")]
27 use std::convert::TryInto;
28 #[cfg(target_arch = "x86_64")]
29 use std::fs::File;
30 use std::os::unix::io::{AsRawFd, RawFd};
31 use std::result;
32 #[cfg(target_arch = "x86_64")]
33 use std::sync::atomic::{AtomicBool, Ordering};
34 use std::sync::{Arc, RwLock};
35 #[cfg(target_arch = "x86_64")]
36 use vm_memory::Address;
37 use vmm_sys_util::eventfd::EventFd;
38 // x86_64 dependencies
39 #[cfg(target_arch = "x86_64")]
40 pub mod x86_64;
41 #[cfg(target_arch = "x86_64")]
42 use crate::arch::x86::NUM_IOAPIC_PINS;
43 #[cfg(target_arch = "aarch64")]
44 use aarch64::{RegList, Register, StandardRegisters};
45 #[cfg(target_arch = "x86_64")]
46 use kvm_bindings::{
47     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
48 };
49 #[cfg(target_arch = "x86_64")]
50 use x86_64::{
51     check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
52 };
53 #[cfg(target_arch = "x86_64")]
54 pub use x86_64::{
55     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
56     Xsave, CPUID_FLAG_VALID_INDEX,
57 };
58 // aarch64 dependencies
59 #[cfg(target_arch = "aarch64")]
60 pub mod aarch64;
61 pub use kvm_bindings;
62 #[cfg(feature = "tdx")]
63 use kvm_bindings::KVMIO;
64 pub use kvm_bindings::{
65     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
66     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
67     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
68 };
69 #[cfg(target_arch = "aarch64")]
70 use kvm_bindings::{
71     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
72     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
73 };
74 pub use kvm_ioctls;
75 pub use kvm_ioctls::{Cap, Kvm};
76 #[cfg(target_arch = "aarch64")]
77 use std::mem;
78 use thiserror::Error;
79 #[cfg(feature = "tdx")]
80 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
81 ///
82 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
83 ///
84 pub use {
85     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
86     kvm_bindings::kvm_device_attr as DeviceAttr,
87     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
88     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
89     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
90     kvm_ioctls::VcpuExit,
91 };
92 
#[cfg(target_arch = "x86_64")]
// KVM_CAP_SGX_ATTRIBUTE capability number; not exported by kvm-bindings at
// this version, so it is defined locally. NOTE(review): keep the value (196)
// in sync with the kernel's include/uapi/linux/kvm.h.
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
// ioctl used to send TDX commands to the kernel (KVM_MEMORY_ENCRYPT_OP,
// _IOWR(KVMIO, 0xba, unsigned long)).
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
// Sub-commands carried by the KVM_MEMORY_ENCRYPT_OP ioctl. The discriminant
// values form the ABI with the kernel and must not be reordered.
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
109 
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
// KVM exposes no VM-wide state to snapshot at this level, so the state type
// is an empty marker struct.
pub struct KvmVmState {}

pub use KvmVmState as VmState;

// Bookkeeping for one memory slot that requested dirty-page logging.
// Mirrors kvm_userspace_memory_region minus the flags, so the region can be
// re-registered with or without KVM_MEM_LOG_DIRTY_PAGES later.
struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}
121 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the KVM VM file descriptor.
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    // MSR entry list (indices pre-populated from the host's supported MSRs);
    // cloned into each vCPU created from this VM.
    msrs: MsrEntries,
    state: KvmVmState,
    // Slots for which dirty-page logging was requested. Logging itself is
    // only enabled between start_dirty_log() and stop_dirty_log().
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
130 
131 ///
132 /// Implementation of Vm trait for KVM
133 /// Example:
134 /// #[cfg(feature = "kvm")]
135 /// extern crate hypervisor
136 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
137 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
138 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
139 /// vm.set/get().unwrap()
140 ///
141 impl vm::Vm for KvmVm {
142     #[cfg(target_arch = "x86_64")]
143     ///
144     /// Sets the address of the three-page region in the VM's address space.
145     ///
146     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
147         self.fd
148             .set_tss_address(offset)
149             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
150     }
151     ///
152     /// Creates an in-kernel interrupt controller.
153     ///
154     fn create_irq_chip(&self) -> vm::Result<()> {
155         self.fd
156             .create_irq_chip()
157             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
158     }
159     ///
160     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
161     ///
162     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
163         self.fd
164             .register_irqfd(fd, gsi)
165             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
166     }
167     ///
168     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
169     ///
170     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
171         self.fd
172             .unregister_irqfd(fd, gsi)
173             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
174     }
175     ///
176     /// Creates a VcpuFd object from a vcpu RawFd.
177     ///
178     fn create_vcpu(
179         &self,
180         id: u8,
181         vmmops: Option<Arc<dyn VmmOps>>,
182     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
183         let vc = self
184             .fd
185             .create_vcpu(id as u64)
186             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
187         let vcpu = KvmVcpu {
188             fd: vc,
189             #[cfg(target_arch = "x86_64")]
190             msrs: self.msrs.clone(),
191             vmmops,
192             #[cfg(target_arch = "x86_64")]
193             hyperv_synic: AtomicBool::new(false),
194         };
195         Ok(Arc::new(vcpu))
196     }
197     ///
198     /// Registers an event to be signaled whenever a certain address is written to.
199     ///
200     fn register_ioevent(
201         &self,
202         fd: &EventFd,
203         addr: &IoEventAddress,
204         datamatch: Option<vm::DataMatch>,
205     ) -> vm::Result<()> {
206         if let Some(dm) = datamatch {
207             match dm {
208                 vm::DataMatch::DataMatch32(kvm_dm32) => self
209                     .fd
210                     .register_ioevent(fd, addr, kvm_dm32)
211                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
212                 vm::DataMatch::DataMatch64(kvm_dm64) => self
213                     .fd
214                     .register_ioevent(fd, addr, kvm_dm64)
215                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
216             }
217         } else {
218             self.fd
219                 .register_ioevent(fd, addr, NoDatamatch)
220                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
221         }
222     }
223     ///
224     /// Unregisters an event from a certain address it has been previously registered to.
225     ///
226     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
227         self.fd
228             .unregister_ioevent(fd, addr, NoDatamatch)
229             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
230     }
231     ///
232     /// Sets the GSI routing table entries, overwriting any previously set
233     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
234     ///
235     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
236         let mut irq_routing =
237             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
238         irq_routing[0].nr = entries.len() as u32;
239         irq_routing[0].flags = 0;
240 
241         unsafe {
242             let entries_slice: &mut [kvm_irq_routing_entry] =
243                 irq_routing[0].entries.as_mut_slice(entries.len());
244             entries_slice.copy_from_slice(entries);
245         }
246 
247         self.fd
248             .set_gsi_routing(&irq_routing[0])
249             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
250     }
251     ///
252     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
253     ///
254     fn make_user_memory_region(
255         &self,
256         slot: u32,
257         guest_phys_addr: u64,
258         memory_size: u64,
259         userspace_addr: u64,
260         readonly: bool,
261         log_dirty_pages: bool,
262     ) -> MemoryRegion {
263         MemoryRegion {
264             slot,
265             guest_phys_addr,
266             memory_size,
267             userspace_addr,
268             flags: if readonly { KVM_MEM_READONLY } else { 0 }
269                 | if log_dirty_pages {
270                     KVM_MEM_LOG_DIRTY_PAGES
271                 } else {
272                     0
273                 },
274         }
275     }
276     ///
277     /// Creates a guest physical memory region.
278     ///
279     fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
280         let mut region = user_memory_region;
281 
282         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
283             if (region.flags & KVM_MEM_READONLY) != 0 {
284                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
285                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
286                 )));
287             }
288 
289             // Keep track of the regions that need dirty pages log
290             self.dirty_log_slots.write().unwrap().insert(
291                 region.slot,
292                 KvmDirtyLogSlot {
293                     slot: region.slot,
294                     guest_phys_addr: region.guest_phys_addr,
295                     memory_size: region.memory_size,
296                     userspace_addr: region.userspace_addr,
297                 },
298             );
299 
300             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
301             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
302             region.flags = 0;
303         }
304 
305         // Safe because guest regions are guaranteed not to overlap.
306         unsafe {
307             self.fd
308                 .set_user_memory_region(region)
309                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
310         }
311     }
312     ///
313     /// Removes a guest physical memory region.
314     ///
315     fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
316         let mut region = user_memory_region;
317 
318         // Remove the corresponding entry from "self.dirty_log_slots" if needed
319         self.dirty_log_slots.write().unwrap().remove(&region.slot);
320 
321         // Setting the size to 0 means "remove"
322         region.memory_size = 0;
323         // Safe because guest regions are guaranteed not to overlap.
324         unsafe {
325             self.fd
326                 .set_user_memory_region(region)
327                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
328         }
329     }
330     ///
331     /// Creates an emulated device in the kernel.
332     ///
333     /// See the documentation for `KVM_CREATE_DEVICE`.
334     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
335         let fd = self
336             .fd
337             .create_device(device)
338             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
339         let device = KvmDevice { fd };
340         Ok(Arc::new(device))
341     }
342     ///
343     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
344     ///
345     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
346     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
347         self.fd
348             .get_preferred_target(kvi)
349             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
350     }
351     #[cfg(target_arch = "x86_64")]
352     fn enable_split_irq(&self) -> vm::Result<()> {
353         // Set TSS
354         self.fd
355             .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
356             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
357         // Create split irqchip
358         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
359         // are not.
360         let mut cap = kvm_enable_cap {
361             cap: KVM_CAP_SPLIT_IRQCHIP,
362             ..Default::default()
363         };
364         cap.args[0] = NUM_IOAPIC_PINS as u64;
365         self.fd
366             .enable_cap(&cap)
367             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
368         Ok(())
369     }
370     #[cfg(target_arch = "x86_64")]
371     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
372         let mut cap = kvm_enable_cap {
373             cap: KVM_CAP_SGX_ATTRIBUTE,
374             ..Default::default()
375         };
376         cap.args[0] = file.as_raw_fd() as u64;
377         self.fd
378             .enable_cap(&cap)
379             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
380         Ok(())
381     }
382     /// Retrieve guest clock.
383     #[cfg(target_arch = "x86_64")]
384     fn get_clock(&self) -> vm::Result<ClockData> {
385         self.fd
386             .get_clock()
387             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
388     }
389     /// Set guest clock.
390     #[cfg(target_arch = "x86_64")]
391     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
392         self.fd
393             .set_clock(data)
394             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
395     }
396     /// Checks if a particular `Cap` is available.
397     fn check_extension(&self, c: Cap) -> bool {
398         self.fd.check_extension(c)
399     }
400     /// Create a device that is used for passthrough
401     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
402         let mut vfio_dev = kvm_create_device {
403             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
404             fd: 0,
405             flags: 0,
406         };
407 
408         self.create_device(&mut vfio_dev)
409             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
410     }
411     ///
412     /// Get the Vm state. Return VM specific data
413     ///
414     fn state(&self) -> vm::Result<VmState> {
415         Ok(self.state)
416     }
417     ///
418     /// Set the VM state
419     ///
420     fn set_state(&self, _state: VmState) -> vm::Result<()> {
421         Ok(())
422     }
423 
424     ///
425     /// Start logging dirty pages
426     ///
427     fn start_dirty_log(&self) -> vm::Result<()> {
428         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
429         for (_, s) in dirty_log_slots.iter() {
430             let region = MemoryRegion {
431                 slot: s.slot,
432                 guest_phys_addr: s.guest_phys_addr,
433                 memory_size: s.memory_size,
434                 userspace_addr: s.userspace_addr,
435                 flags: KVM_MEM_LOG_DIRTY_PAGES,
436             };
437             // Safe because guest regions are guaranteed not to overlap.
438             unsafe {
439                 self.fd
440                     .set_user_memory_region(region)
441                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
442             }
443         }
444 
445         Ok(())
446     }
447 
448     ///
449     /// Stop logging dirty pages
450     ///
451     fn stop_dirty_log(&self) -> vm::Result<()> {
452         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
453         for (_, s) in dirty_log_slots.iter() {
454             let region = MemoryRegion {
455                 slot: s.slot,
456                 guest_phys_addr: s.guest_phys_addr,
457                 memory_size: s.memory_size,
458                 userspace_addr: s.userspace_addr,
459                 flags: 0,
460             };
461             // Safe because guest regions are guaranteed not to overlap.
462             unsafe {
463                 self.fd
464                     .set_user_memory_region(region)
465                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
466             }
467         }
468 
469         Ok(())
470     }
471 
472     ///
473     /// Get dirty pages bitmap (one bit per page)
474     ///
475     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
476         self.fd
477             .get_dirty_log(slot, memory_size as usize)
478             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
479     }
480 
481     ///
482     /// Initialize TDX for this VM
483     ///
484     #[cfg(feature = "tdx")]
485     fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
486         #[repr(C)]
487         struct TdxInitVm {
488             max_vcpus: u32,
489             tsc_khz: u32,
490             attributes: u64,
491             cpuid: u64,
492             mrconfigid: [u64; 6],
493             mrowner: [u64; 6],
494             mrownerconfig: [u64; 6],
495             reserved: [u64; 43],
496         }
497         let data = TdxInitVm {
498             max_vcpus,
499             tsc_khz: 0,
500             attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
501             cpuid: cpuid.as_fam_struct_ptr() as u64,
502             mrconfigid: [0; 6],
503             mrowner: [0; 6],
504             mrownerconfig: [0; 6],
505             reserved: [0; 43],
506         };
507 
508         tdx_command(
509             &self.fd.as_raw_fd(),
510             TdxCommand::InitVm,
511             0,
512             &data as *const _ as u64,
513         )
514         .map_err(vm::HypervisorVmError::InitializeTdx)
515     }
516 
517     ///
518     /// Finalize the TDX setup for this VM
519     ///
520     #[cfg(feature = "tdx")]
521     fn tdx_finalize(&self) -> vm::Result<()> {
522         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
523             .map_err(vm::HypervisorVmError::FinalizeTdx)
524     }
525 
526     ///
527     /// Initialize memory regions for the TDX VM
528     ///
529     #[cfg(feature = "tdx")]
530     fn tdx_init_memory_region(
531         &self,
532         host_address: u64,
533         guest_address: u64,
534         size: u64,
535         measure: bool,
536     ) -> vm::Result<()> {
537         #[repr(C)]
538         struct TdxInitMemRegion {
539             host_address: u64,
540             guest_address: u64,
541             pages: u64,
542         }
543         let data = TdxInitMemRegion {
544             host_address,
545             guest_address,
546             pages: size / 4096,
547         };
548 
549         tdx_command(
550             &self.fd.as_raw_fd(),
551             TdxCommand::InitMemRegion,
552             if measure { 1 } else { 0 },
553             &data as *const _ as u64,
554         )
555         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
556     }
557 }
558 
#[cfg(feature = "tdx")]
/// Issues one TDX sub-command through the `KVM_MEMORY_ENCRYPT_OP` ioctl on
/// the given KVM fd, returning the OS error when the ioctl fails.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Layout must match what the kernel expects for KVM_MEMORY_ENCRYPT_OP.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }

    let request = TdxIoctlCmd {
        command,
        metadata,
        data,
    };

    // SAFETY: the ioctl is issued on a caller-provided KVM fd with a
    // properly constructed #[repr(C)] command structure that lives for the
    // duration of the call.
    let rc = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &request as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if rc >= 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}
590 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to /dev/kvm, used for system-scoped (non-VM) ioctls.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
/// Result type for KVM-specific fallible operations.
pub type KvmResult<T> = result::Result<T, KvmError>;
602 impl KvmHypervisor {
603     /// Create a hypervisor based on Kvm
604     pub fn new() -> hypervisor::Result<KvmHypervisor> {
605         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
606         let api_version = kvm_obj.get_api_version();
607 
608         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
609             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
610         }
611 
612         Ok(KvmHypervisor { kvm: kvm_obj })
613     }
614 }
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object.
    /// `vm_type` is forwarded to the KVM_CREATE_VM ioctl: 0 is the default
    /// platform type; on aarch64 it can carry the requested IPA size.
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            // Pre-build the MSR entry list from the host's supported MSR
            // indices; each vCPU created from this VM clones this list for
            // save/restore purposes.
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    /// Verify all KVM extensions this crate depends on are present on the host.
    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Per-vCPU KVM file descriptor.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR entry list cloned from the owning VM (indices of host-supported MSRs).
    msrs: MsrEntries,
    // Optional callbacks into the VMM (e.g. for MMIO/PIO handling).
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    // Set once Hyper-V SynIC is enabled; consulted later when deciding which
    // MSRs should be saved.
    hyperv_synic: AtomicBool,
}
746 /// Implementation of Vcpu trait for KVM
747 /// Example:
748 /// #[cfg(feature = "kvm")]
749 /// extern crate hypervisor
750 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
751 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
752 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
753 /// let vcpu = vm.create_vcpu(0, None).unwrap();
754 /// vcpu.get/set().unwrap()
755 ///
756 impl cpu::Vcpu for KvmVcpu {
757     #[cfg(target_arch = "x86_64")]
758     ///
759     /// Returns the vCPU general purpose registers.
760     ///
761     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
762         self.fd
763             .get_regs()
764             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
765     }
766     #[cfg(target_arch = "x86_64")]
767     ///
768     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
769     ///
770     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
771         self.fd
772             .set_regs(regs)
773             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
774     }
775     #[cfg(target_arch = "x86_64")]
776     ///
777     /// Returns the vCPU special registers.
778     ///
779     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
780         self.fd
781             .get_sregs()
782             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
783     }
784     #[cfg(target_arch = "x86_64")]
785     ///
786     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
787     ///
788     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
789         self.fd
790             .set_sregs(sregs)
791             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
792     }
793     #[cfg(target_arch = "x86_64")]
794     ///
795     /// Returns the floating point state (FPU) from the vCPU.
796     ///
797     fn get_fpu(&self) -> cpu::Result<FpuState> {
798         self.fd
799             .get_fpu()
800             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
801     }
802     #[cfg(target_arch = "x86_64")]
803     ///
804     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
805     ///
806     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
807         self.fd
808             .set_fpu(fpu)
809             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
810     }
811     #[cfg(target_arch = "x86_64")]
812     ///
813     /// X86 specific call to setup the CPUID registers.
814     ///
815     fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
816         self.fd
817             .set_cpuid2(cpuid)
818             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
819     }
820     #[cfg(target_arch = "x86_64")]
821     ///
822     /// X86 specific call to enable HyperV SynIC
823     ///
824     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
825         // Update the information about Hyper-V SynIC being enabled and
826         // emulated as it will influence later which MSRs should be saved.
827         self.hyperv_synic.store(true, Ordering::Release);
828 
829         let cap = kvm_enable_cap {
830             cap: KVM_CAP_HYPERV_SYNIC,
831             ..Default::default()
832         };
833         self.fd
834             .enable_cap(&cap)
835             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
836     }
837     ///
838     /// X86 specific call to retrieve the CPUID registers.
839     ///
840     #[cfg(target_arch = "x86_64")]
841     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
842         self.fd
843             .get_cpuid2(num_entries)
844             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
845     }
846     #[cfg(target_arch = "x86_64")]
847     ///
848     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
849     ///
850     fn get_lapic(&self) -> cpu::Result<LapicState> {
851         self.fd
852             .get_lapic()
853             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
854     }
855     #[cfg(target_arch = "x86_64")]
856     ///
857     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
858     ///
859     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
860         self.fd
861             .set_lapic(klapic)
862             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
863     }
864     #[cfg(target_arch = "x86_64")]
865     ///
866     /// Returns the model-specific registers (MSR) for this vCPU.
867     ///
868     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
869         self.fd
870             .get_msrs(msrs)
871             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
872     }
873     #[cfg(target_arch = "x86_64")]
874     ///
875     /// Setup the model-specific registers (MSR) for this vCPU.
876     /// Returns the number of MSR entries actually written.
877     ///
878     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
879         self.fd
880             .set_msrs(msrs)
881             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
882     }
883     ///
884     /// Returns the vcpu's current "multiprocessing state".
885     ///
886     fn get_mp_state(&self) -> cpu::Result<MpState> {
887         self.fd
888             .get_mp_state()
889             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
890     }
891     ///
892     /// Sets the vcpu's current "multiprocessing state".
893     ///
894     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
895         self.fd
896             .set_mp_state(mp_state)
897             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
898     }
899     #[cfg(target_arch = "x86_64")]
900     ///
901     /// X86 specific call that returns the vcpu's current "xsave struct".
902     ///
903     fn get_xsave(&self) -> cpu::Result<Xsave> {
904         self.fd
905             .get_xsave()
906             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
907     }
908     #[cfg(target_arch = "x86_64")]
909     ///
910     /// X86 specific call that sets the vcpu's current "xsave struct".
911     ///
912     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
913         self.fd
914             .set_xsave(xsave)
915             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
916     }
917     #[cfg(target_arch = "x86_64")]
918     ///
919     /// X86 specific call that returns the vcpu's current "xcrs".
920     ///
921     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
922         self.fd
923             .get_xcrs()
924             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
925     }
926     #[cfg(target_arch = "x86_64")]
927     ///
928     /// X86 specific call that sets the vcpu's current "xcrs".
929     ///
930     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
931         self.fd
932             .set_xcrs(xcrs)
933             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
934     }
935     ///
936     /// Triggers the running of the current virtual CPU returning an exit reason.
937     ///
938     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
939         match self.fd.run() {
940             Ok(run) => match run {
941                 #[cfg(target_arch = "x86_64")]
942                 VcpuExit::IoIn(addr, data) => {
943                     if let Some(vmmops) = &self.vmmops {
944                         return vmmops
945                             .pio_read(addr.into(), data)
946                             .map(|_| cpu::VmExit::Ignore)
947                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
948                     }
949 
950                     Ok(cpu::VmExit::IoIn(addr, data))
951                 }
952                 #[cfg(target_arch = "x86_64")]
953                 VcpuExit::IoOut(addr, data) => {
954                     if let Some(vmmops) = &self.vmmops {
955                         return vmmops
956                             .pio_write(addr.into(), data)
957                             .map(|_| cpu::VmExit::Ignore)
958                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
959                     }
960 
961                     Ok(cpu::VmExit::IoOut(addr, data))
962                 }
963                 #[cfg(target_arch = "x86_64")]
964                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
965                 #[cfg(target_arch = "x86_64")]
966                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
967 
968                 #[cfg(target_arch = "aarch64")]
969                 VcpuExit::SystemEvent(event_type, flags) => {
970                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
971                     // On Aarch64, when the VM is shutdown, run() returns
972                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
973                     if event_type == KVM_SYSTEM_EVENT_RESET {
974                         Ok(cpu::VmExit::Reset)
975                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
976                         Ok(cpu::VmExit::Shutdown)
977                     } else {
978                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
979                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
980                             event_type,
981                             flags
982                         )))
983                     }
984                 }
985 
986                 VcpuExit::MmioRead(addr, data) => {
987                     if let Some(vmmops) = &self.vmmops {
988                         return vmmops
989                             .mmio_read(addr, data)
990                             .map(|_| cpu::VmExit::Ignore)
991                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
992                     }
993 
994                     Ok(cpu::VmExit::MmioRead(addr, data))
995                 }
996                 VcpuExit::MmioWrite(addr, data) => {
997                     if let Some(vmmops) = &self.vmmops {
998                         return vmmops
999                             .mmio_write(addr, data)
1000                             .map(|_| cpu::VmExit::Ignore)
1001                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1002                     }
1003 
1004                     Ok(cpu::VmExit::MmioWrite(addr, data))
1005                 }
1006                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1007 
1008                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1009                     "Unexpected exit reason on vcpu run: {:?}",
1010                     r
1011                 ))),
1012             },
1013 
1014             Err(ref e) => match e.errno() {
1015                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1016                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1017                     "VCPU error {:?}",
1018                     e
1019                 ))),
1020             },
1021         }
1022     }
1023     #[cfg(target_arch = "x86_64")]
1024     ///
1025     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
1026     /// states of the vcpu.
1027     ///
1028     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
1029         self.fd
1030             .get_vcpu_events()
1031             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
1032     }
1033     #[cfg(target_arch = "x86_64")]
1034     ///
1035     /// Sets pending exceptions, interrupts, and NMIs as well as related states
1036     /// of the vcpu.
1037     ///
1038     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
1039         self.fd
1040             .set_vcpu_events(events)
1041             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
1042     }
1043     #[cfg(target_arch = "x86_64")]
1044     ///
1045     /// Let the guest know that it has been paused, which prevents from
1046     /// potential soft lockups when being resumed.
1047     ///
1048     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1049         self.fd
1050             .kvmclock_ctrl()
1051             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
1052     }
1053     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1054     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1055         self.fd
1056             .vcpu_init(kvi)
1057             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1058     }
1059     ///
1060     /// Sets the value of one register for this vCPU.
1061     ///
1062     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1063     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1064         self.fd
1065             .set_one_reg(reg_id, data)
1066             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1067     }
1068     ///
1069     /// Gets the value of one register for this vCPU.
1070     ///
1071     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1072     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1073         self.fd
1074             .get_one_reg(reg_id)
1075             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1076     }
1077     ///
1078     /// Gets a list of the guest registers that are supported for the
1079     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1080     ///
1081     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1082     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1083         self.fd
1084             .get_reg_list(reg_list)
1085             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1086     }
1087     ///
1088     /// Save the state of the core registers.
1089     ///
1090     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1091     fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
1092         let mut off = offset__of!(user_pt_regs, regs);
1093         // There are 31 user_pt_regs:
1094         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1095         // These actually are the general-purpose registers of the Armv8-a
1096         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1097         for i in 0..31 {
1098             state.regs.regs[i] = self
1099                 .fd
1100                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1101                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1102             off += std::mem::size_of::<u64>();
1103         }
1104 
1105         // We are now entering the "Other register" section of the ARMv8-a architecture.
1106         // First one, stack pointer.
1107         let off = offset__of!(user_pt_regs, sp);
1108         state.regs.sp = self
1109             .fd
1110             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1111             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1112 
1113         // Second one, the program counter.
1114         let off = offset__of!(user_pt_regs, pc);
1115         state.regs.pc = self
1116             .fd
1117             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1118             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1119 
1120         // Next is the processor state.
1121         let off = offset__of!(user_pt_regs, pstate);
1122         state.regs.pstate = self
1123             .fd
1124             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1125             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1126 
1127         // The stack pointer associated with EL1
1128         let off = offset__of!(kvm_regs, sp_el1);
1129         state.sp_el1 = self
1130             .fd
1131             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1132             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1133 
1134         // Exception Link Register for EL1, when taking an exception to EL1, this register
1135         // holds the address to which to return afterwards.
1136         let off = offset__of!(kvm_regs, elr_el1);
1137         state.elr_el1 = self
1138             .fd
1139             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1140             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1141 
1142         // Saved Program Status Registers, there are 5 of them used in the kernel.
1143         let mut off = offset__of!(kvm_regs, spsr);
1144         for i in 0..KVM_NR_SPSR as usize {
1145             state.spsr[i] = self
1146                 .fd
1147                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1148                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1149             off += std::mem::size_of::<u64>();
1150         }
1151 
1152         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1153         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1154         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1155         for i in 0..32 {
1156             state.fp_regs.vregs[i] = self
1157                 .fd
1158                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1159                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1160                 .into();
1161             off += mem::size_of::<u128>();
1162         }
1163 
1164         // Floating-point Status Register
1165         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1166         state.fp_regs.fpsr = self
1167             .fd
1168             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1169             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1170             as u32;
1171 
1172         // Floating-point Control Register
1173         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1174         state.fp_regs.fpcr = self
1175             .fd
1176             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1177             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1178             as u32;
1179         Ok(())
1180     }
1181     ///
1182     /// Restore the state of the core registers.
1183     ///
1184     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1185     fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
1186         // The function follows the exact identical order from `state`. Look there
1187         // for some additional info on registers.
1188         let mut off = offset__of!(user_pt_regs, regs);
1189         for i in 0..31 {
1190             self.fd
1191                 .set_one_reg(
1192                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1193                     state.regs.regs[i],
1194                 )
1195                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1196             off += std::mem::size_of::<u64>();
1197         }
1198 
1199         let off = offset__of!(user_pt_regs, sp);
1200         self.fd
1201             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1202             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1203 
1204         let off = offset__of!(user_pt_regs, pc);
1205         self.fd
1206             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1207             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1208 
1209         let off = offset__of!(user_pt_regs, pstate);
1210         self.fd
1211             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1212             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1213 
1214         let off = offset__of!(kvm_regs, sp_el1);
1215         self.fd
1216             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1217             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1218 
1219         let off = offset__of!(kvm_regs, elr_el1);
1220         self.fd
1221             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1222             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1223 
1224         let mut off = offset__of!(kvm_regs, spsr);
1225         for i in 0..KVM_NR_SPSR as usize {
1226             self.fd
1227                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1228                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1229             off += std::mem::size_of::<u64>();
1230         }
1231 
1232         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1233         for i in 0..32 {
1234             self.fd
1235                 .set_one_reg(
1236                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1237                     state.fp_regs.vregs[i] as u64,
1238                 )
1239                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1240             off += mem::size_of::<u128>();
1241         }
1242 
1243         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1244         self.fd
1245             .set_one_reg(
1246                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1247                 state.fp_regs.fpsr as u64,
1248             )
1249             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1250 
1251         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1252         self.fd
1253             .set_one_reg(
1254                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1255                 state.fp_regs.fpcr as u64,
1256             )
1257             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1258         Ok(())
1259     }
1260     ///
1261     /// Save the state of the system registers.
1262     ///
1263     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1264     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1265         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1266         // around 500 registers.
1267         let mut reg_list = RegList::new(500).unwrap();
1268         self.fd
1269             .get_reg_list(&mut reg_list)
1270             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1271 
1272         // At this point reg_list should contain: core registers and system registers.
1273         // The register list contains the number of registers and their ids. We will be needing to
1274         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1275         // the core registers which are represented in the kernel by kvm_regs structure and for which
1276         // we can calculate the id based on the offset in the structure.
1277         reg_list.retain(|regid| is_system_register(*regid));
1278 
1279         // Now, for the rest of the registers left in the previously fetched register list, we are
1280         // simply calling KVM_GET_ONE_REG.
1281         let indices = reg_list.as_slice();
1282         for index in indices.iter() {
1283             state.push(kvm_bindings::kvm_one_reg {
1284                 id: *index,
1285                 addr: self
1286                     .fd
1287                     .get_one_reg(*index)
1288                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1289             });
1290         }
1291 
1292         Ok(())
1293     }
1294     ///
1295     /// Restore the state of the system registers.
1296     ///
1297     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1298     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1299         for reg in state {
1300             self.fd
1301                 .set_one_reg(reg.id, reg.addr)
1302                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1303         }
1304         Ok(())
1305     }
1306     ///
1307     /// Read the MPIDR - Multiprocessor Affinity Register.
1308     ///
1309     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1310     fn read_mpidr(&self) -> cpu::Result<u64> {
1311         self.fd
1312             .get_one_reg(MPIDR_EL1)
1313             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1314     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stopped early: `num_msrs` is the index of the first
            // entry it refused to read.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Skip the faulty MSR and resume fetching right after it.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                // The whole remainder was read successfully: done.
                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1446     ///
1447     /// Get the current AArch64 CPU state
1448     ///
1449     #[cfg(target_arch = "aarch64")]
1450     fn state(&self) -> cpu::Result<CpuState> {
1451         let mut state = CpuState {
1452             mp_state: self.get_mp_state()?,
1453             mpidr: self.read_mpidr()?,
1454             ..Default::default()
1455         };
1456         self.core_registers(&mut state.core_regs)?;
1457         self.system_registers(&mut state.sys_regs)?;
1458 
1459         Ok(state)
1460     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // The call order below follows the ordering requirements documented
        // above; do not reorder without re-reading them.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS stopped early: `num_msrs` is the index of the first
            // entry it refused to write.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                // Skip the faulty MSR and retry with the remainder of the list.
                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1545     ///
1546     /// Restore the previously saved AArch64 CPU state
1547     ///
1548     #[cfg(target_arch = "aarch64")]
1549     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1550         self.set_core_registers(&state.core_regs)?;
1551         self.set_system_registers(&state.sys_regs)?;
1552         self.set_mp_state(state.mp_state)?;
1553 
1554         Ok(())
1555     }
1556 
1557     ///
1558     /// Initialize TDX for this CPU
1559     ///
1560     #[cfg(feature = "tdx")]
1561     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1562         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1563             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1564     }
1565 }
1566 
/// Device struct for KVM
pub struct KvmDevice {
    // File descriptor of the in-kernel device; also exposed through AsRawFd.
    fd: DeviceFd,
}
1571 
1572 impl device::Device for KvmDevice {
1573     ///
1574     /// Set device attribute
1575     ///
1576     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1577         self.fd
1578             .set_device_attr(attr)
1579             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1580     }
1581     ///
1582     /// Get device attribute
1583     ///
1584     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1585         self.fd
1586             .get_device_attr(attr)
1587             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1588     }
1589 }
1590 
impl AsRawFd for KvmDevice {
    /// Expose the raw file descriptor backing this KVM device.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
1596