xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 348def9dfb1243a0538e4432bc48dae691abf68b)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 use std::collections::HashMap;
26 #[cfg(target_arch = "aarch64")]
27 use std::convert::TryInto;
28 #[cfg(target_arch = "x86_64")]
29 use std::fs::File;
30 use std::os::unix::io::{AsRawFd, RawFd};
31 use std::result;
32 #[cfg(target_arch = "x86_64")]
33 use std::sync::atomic::{AtomicBool, Ordering};
34 use std::sync::{Arc, RwLock};
35 use vmm_sys_util::eventfd::EventFd;
36 // x86_64 dependencies
37 #[cfg(target_arch = "x86_64")]
38 pub mod x86_64;
39 #[cfg(target_arch = "x86_64")]
40 use crate::arch::x86::NUM_IOAPIC_PINS;
41 #[cfg(target_arch = "aarch64")]
42 use aarch64::{RegList, Register, StandardRegisters};
43 #[cfg(target_arch = "x86_64")]
44 use kvm_bindings::{
45     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
46 };
47 #[cfg(target_arch = "x86_64")]
48 use x86_64::{check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters};
49 #[cfg(target_arch = "x86_64")]
50 pub use x86_64::{
51     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
52     Xsave, CPUID_FLAG_VALID_INDEX,
53 };
54 // aarch64 dependencies
55 #[cfg(target_arch = "aarch64")]
56 pub mod aarch64;
57 pub use kvm_bindings;
58 #[cfg(feature = "tdx")]
59 use kvm_bindings::KVMIO;
60 pub use kvm_bindings::{
61     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
62     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
63     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
64 };
65 #[cfg(target_arch = "aarch64")]
66 use kvm_bindings::{
67     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
68     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
69 };
70 pub use kvm_ioctls;
71 pub use kvm_ioctls::{Cap, Kvm};
72 #[cfg(target_arch = "aarch64")]
73 use std::mem;
74 use thiserror::Error;
75 #[cfg(feature = "tdx")]
76 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
77 ///
78 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
79 ///
80 pub use {
81     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
82     kvm_bindings::kvm_device_attr as DeviceAttr,
83     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
84     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
85     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
86     kvm_ioctls::VcpuExit,
87 };
88 
// Capability number for KVM_CAP_SGX_ATTRIBUTE, defined locally because it is
// not exposed through the kvm-bindings crate used here.
#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

// Declares the KVM_MEMORY_ENCRYPT_OP ioctl wrapper used by tdx_command() below.
#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
94 
// Sub-command codes passed to the KVM_MEMORY_ENCRYPT_OP ioctl to drive the
// TDX lifecycle (see tdx_command()). Discriminants start at 0 and increase
// sequentially; the numeric values are part of the kernel ABI.
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
105 
/// KVM-specific VM state. Currently empty: KVM requires no extra data to be
/// saved/restored at the VM level, but the type keeps the Vm trait's
/// state()/set_state() API uniform across hypervisors.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

// Generic alias so hypervisor-agnostic code can refer to `VmState`.
pub use KvmVmState as VmState;
110 
// Bookkeeping for a memory slot that was registered with dirty-page logging
// requested. Holds everything needed to re-issue KVM_SET_USER_MEMORY_REGION
// when logging is toggled in start_dirty_log()/stop_dirty_log().
struct KvmDirtyLogSlot {
    // KVM memory slot index.
    slot: u32,
    // Guest physical address where the region starts.
    guest_phys_addr: u64,
    // Region size in bytes.
    memory_size: u64,
    // Host virtual address backing the region.
    userspace_addr: u64,
}
117 
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the kernel VM file descriptor.
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    // MSR entry template (indices pre-filled from the host's supported MSR
    // list in create_vm_with_type()); cloned into each vCPU on creation.
    msrs: MsrEntries,
    // Placeholder state returned by state(); KVM keeps no extra VM data.
    state: KvmVmState,
    // Slots that requested dirty-page logging, keyed by slot index; used to
    // toggle KVM_MEM_LOG_DIRTY_PAGES in start_dirty_log()/stop_dirty_log().
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}
126 
127 ///
128 /// Implementation of Vm trait for KVM
129 /// Example:
130 /// #[cfg(feature = "kvm")]
131 /// extern crate hypervisor
132 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
133 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
134 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
135 /// vm.set/get().unwrap()
136 ///
137 impl vm::Vm for KvmVm {
138     #[cfg(target_arch = "x86_64")]
139     ///
140     /// Sets the address of the three-page region in the VM's address space.
141     ///
142     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
143         self.fd
144             .set_tss_address(offset)
145             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
146     }
147     ///
148     /// Creates an in-kernel interrupt controller.
149     ///
150     fn create_irq_chip(&self) -> vm::Result<()> {
151         self.fd
152             .create_irq_chip()
153             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
154     }
155     ///
156     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
157     ///
158     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
159         self.fd
160             .register_irqfd(fd, gsi)
161             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
162     }
163     ///
164     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
165     ///
166     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
167         self.fd
168             .unregister_irqfd(fd, gsi)
169             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
170     }
171     ///
172     /// Creates a VcpuFd object from a vcpu RawFd.
173     ///
174     fn create_vcpu(
175         &self,
176         id: u8,
177         vmmops: Option<Arc<dyn VmmOps>>,
178     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
179         let vc = self
180             .fd
181             .create_vcpu(id as u64)
182             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
183         let vcpu = KvmVcpu {
184             fd: vc,
185             #[cfg(target_arch = "x86_64")]
186             msrs: self.msrs.clone(),
187             vmmops,
188             #[cfg(target_arch = "x86_64")]
189             hyperv_synic: AtomicBool::new(false),
190         };
191         Ok(Arc::new(vcpu))
192     }
193     ///
194     /// Registers an event to be signaled whenever a certain address is written to.
195     ///
196     fn register_ioevent(
197         &self,
198         fd: &EventFd,
199         addr: &IoEventAddress,
200         datamatch: Option<vm::DataMatch>,
201     ) -> vm::Result<()> {
202         if let Some(dm) = datamatch {
203             match dm {
204                 vm::DataMatch::DataMatch32(kvm_dm32) => self
205                     .fd
206                     .register_ioevent(fd, addr, kvm_dm32)
207                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
208                 vm::DataMatch::DataMatch64(kvm_dm64) => self
209                     .fd
210                     .register_ioevent(fd, addr, kvm_dm64)
211                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
212             }
213         } else {
214             self.fd
215                 .register_ioevent(fd, addr, NoDatamatch)
216                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
217         }
218     }
219     ///
220     /// Unregisters an event from a certain address it has been previously registered to.
221     ///
222     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
223         self.fd
224             .unregister_ioevent(fd, addr, NoDatamatch)
225             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
226     }
227     ///
228     /// Sets the GSI routing table entries, overwriting any previously set
229     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
230     ///
231     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
232         let mut irq_routing =
233             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
234         irq_routing[0].nr = entries.len() as u32;
235         irq_routing[0].flags = 0;
236 
237         // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
238         // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
239         // everything from entries.
240         unsafe {
241             let entries_slice: &mut [kvm_irq_routing_entry] =
242                 irq_routing[0].entries.as_mut_slice(entries.len());
243             entries_slice.copy_from_slice(entries);
244         }
245 
246         self.fd
247             .set_gsi_routing(&irq_routing[0])
248             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
249     }
250     ///
251     /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
252     ///
253     fn make_user_memory_region(
254         &self,
255         slot: u32,
256         guest_phys_addr: u64,
257         memory_size: u64,
258         userspace_addr: u64,
259         readonly: bool,
260         log_dirty_pages: bool,
261     ) -> MemoryRegion {
262         MemoryRegion {
263             slot,
264             guest_phys_addr,
265             memory_size,
266             userspace_addr,
267             flags: if readonly { KVM_MEM_READONLY } else { 0 }
268                 | if log_dirty_pages {
269                     KVM_MEM_LOG_DIRTY_PAGES
270                 } else {
271                     0
272                 },
273         }
274     }
275     ///
276     /// Creates a guest physical memory region.
277     ///
278     fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
279         let mut region = user_memory_region;
280 
281         if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
282             if (region.flags & KVM_MEM_READONLY) != 0 {
283                 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
284                     "Error creating regions with both 'dirty-pages-log' and 'read-only'."
285                 )));
286             }
287 
288             // Keep track of the regions that need dirty pages log
289             self.dirty_log_slots.write().unwrap().insert(
290                 region.slot,
291                 KvmDirtyLogSlot {
292                     slot: region.slot,
293                     guest_phys_addr: region.guest_phys_addr,
294                     memory_size: region.memory_size,
295                     userspace_addr: region.userspace_addr,
296                 },
297             );
298 
299             // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
300             // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
301             region.flags = 0;
302         }
303 
304         // SAFETY: Safe because guest regions are guaranteed not to overlap.
305         unsafe {
306             self.fd
307                 .set_user_memory_region(region)
308                 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
309         }
310     }
311     ///
312     /// Removes a guest physical memory region.
313     ///
314     fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
315         let mut region = user_memory_region;
316 
317         // Remove the corresponding entry from "self.dirty_log_slots" if needed
318         self.dirty_log_slots.write().unwrap().remove(&region.slot);
319 
320         // Setting the size to 0 means "remove"
321         region.memory_size = 0;
322         // SAFETY: Safe because guest regions are guaranteed not to overlap.
323         unsafe {
324             self.fd
325                 .set_user_memory_region(region)
326                 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
327         }
328     }
329     ///
330     /// Creates an emulated device in the kernel.
331     ///
332     /// See the documentation for `KVM_CREATE_DEVICE`.
333     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
334         let fd = self
335             .fd
336             .create_device(device)
337             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
338         let device = KvmDevice { fd };
339         Ok(Arc::new(device))
340     }
341     ///
342     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
343     ///
344     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
345     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
346         self.fd
347             .get_preferred_target(kvi)
348             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
349     }
350     #[cfg(target_arch = "x86_64")]
351     fn enable_split_irq(&self) -> vm::Result<()> {
352         // Create split irqchip
353         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
354         // are not.
355         let mut cap = kvm_enable_cap {
356             cap: KVM_CAP_SPLIT_IRQCHIP,
357             ..Default::default()
358         };
359         cap.args[0] = NUM_IOAPIC_PINS as u64;
360         self.fd
361             .enable_cap(&cap)
362             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
363         Ok(())
364     }
365     #[cfg(target_arch = "x86_64")]
366     fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
367         let mut cap = kvm_enable_cap {
368             cap: KVM_CAP_SGX_ATTRIBUTE,
369             ..Default::default()
370         };
371         cap.args[0] = file.as_raw_fd() as u64;
372         self.fd
373             .enable_cap(&cap)
374             .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
375         Ok(())
376     }
377     /// Retrieve guest clock.
378     #[cfg(target_arch = "x86_64")]
379     fn get_clock(&self) -> vm::Result<ClockData> {
380         self.fd
381             .get_clock()
382             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
383     }
384     /// Set guest clock.
385     #[cfg(target_arch = "x86_64")]
386     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
387         self.fd
388             .set_clock(data)
389             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
390     }
391     /// Checks if a particular `Cap` is available.
392     fn check_extension(&self, c: Cap) -> bool {
393         self.fd.check_extension(c)
394     }
395     /// Create a device that is used for passthrough
396     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
397         let mut vfio_dev = kvm_create_device {
398             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
399             fd: 0,
400             flags: 0,
401         };
402 
403         self.create_device(&mut vfio_dev)
404             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
405     }
406     ///
407     /// Get the Vm state. Return VM specific data
408     ///
409     fn state(&self) -> vm::Result<VmState> {
410         Ok(self.state)
411     }
412     ///
413     /// Set the VM state
414     ///
415     fn set_state(&self, _state: VmState) -> vm::Result<()> {
416         Ok(())
417     }
418 
419     ///
420     /// Start logging dirty pages
421     ///
422     fn start_dirty_log(&self) -> vm::Result<()> {
423         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
424         for (_, s) in dirty_log_slots.iter() {
425             let region = MemoryRegion {
426                 slot: s.slot,
427                 guest_phys_addr: s.guest_phys_addr,
428                 memory_size: s.memory_size,
429                 userspace_addr: s.userspace_addr,
430                 flags: KVM_MEM_LOG_DIRTY_PAGES,
431             };
432             // SAFETY: Safe because guest regions are guaranteed not to overlap.
433             unsafe {
434                 self.fd
435                     .set_user_memory_region(region)
436                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
437             }
438         }
439 
440         Ok(())
441     }
442 
443     ///
444     /// Stop logging dirty pages
445     ///
446     fn stop_dirty_log(&self) -> vm::Result<()> {
447         let dirty_log_slots = self.dirty_log_slots.read().unwrap();
448         for (_, s) in dirty_log_slots.iter() {
449             let region = MemoryRegion {
450                 slot: s.slot,
451                 guest_phys_addr: s.guest_phys_addr,
452                 memory_size: s.memory_size,
453                 userspace_addr: s.userspace_addr,
454                 flags: 0,
455             };
456             // SAFETY: Safe because guest regions are guaranteed not to overlap.
457             unsafe {
458                 self.fd
459                     .set_user_memory_region(region)
460                     .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
461             }
462         }
463 
464         Ok(())
465     }
466 
467     ///
468     /// Get dirty pages bitmap (one bit per page)
469     ///
470     fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
471         self.fd
472             .get_dirty_log(slot, memory_size as usize)
473             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
474     }
475 
476     ///
477     /// Initialize TDX for this VM
478     ///
479     #[cfg(feature = "tdx")]
480     fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
481         #[repr(C)]
482         struct TdxInitVm {
483             max_vcpus: u32,
484             tsc_khz: u32,
485             attributes: u64,
486             cpuid: u64,
487             mrconfigid: [u64; 6],
488             mrowner: [u64; 6],
489             mrownerconfig: [u64; 6],
490             reserved: [u64; 43],
491         }
492         let data = TdxInitVm {
493             max_vcpus,
494             tsc_khz: 0,
495             attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
496             cpuid: cpuid.as_fam_struct_ptr() as u64,
497             mrconfigid: [0; 6],
498             mrowner: [0; 6],
499             mrownerconfig: [0; 6],
500             reserved: [0; 43],
501         };
502 
503         tdx_command(
504             &self.fd.as_raw_fd(),
505             TdxCommand::InitVm,
506             0,
507             &data as *const _ as u64,
508         )
509         .map_err(vm::HypervisorVmError::InitializeTdx)
510     }
511 
512     ///
513     /// Finalize the TDX setup for this VM
514     ///
515     #[cfg(feature = "tdx")]
516     fn tdx_finalize(&self) -> vm::Result<()> {
517         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
518             .map_err(vm::HypervisorVmError::FinalizeTdx)
519     }
520 
521     ///
522     /// Initialize memory regions for the TDX VM
523     ///
524     #[cfg(feature = "tdx")]
525     fn tdx_init_memory_region(
526         &self,
527         host_address: u64,
528         guest_address: u64,
529         size: u64,
530         measure: bool,
531     ) -> vm::Result<()> {
532         #[repr(C)]
533         struct TdxInitMemRegion {
534             host_address: u64,
535             guest_address: u64,
536             pages: u64,
537         }
538         let data = TdxInitMemRegion {
539             host_address,
540             guest_address,
541             pages: size / 4096,
542         };
543 
544         tdx_command(
545             &self.fd.as_raw_fd(),
546             TdxCommand::InitMemRegion,
547             if measure { 1 } else { 0 },
548             &data as *const _ as u64,
549         )
550         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
551     }
552 }
553 
// Issues one KVM_MEMORY_ENCRYPT_OP ioctl on the given VM/vCPU fd, packaging
// the TDX sub-command, its metadata word and a pointer-sized data argument
// into the C layout the kernel expects. Returns the OS error on failure.
#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // C-compatible layout of the ioctl payload.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };

    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    match ret {
        r if r < 0 => Err(std::io::Error::last_os_error()),
        _ => Ok(()),
    }
}
586 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to the KVM subsystem provided by the kvm-ioctls crate.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    // Returned when a KVM extension required by this crate is absent.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
/// Convenience alias for results whose error type is `KvmError`.
pub type KvmResult<T> = result::Result<T, KvmError>;
598 impl KvmHypervisor {
599     /// Create a hypervisor based on Kvm
600     pub fn new() -> hypervisor::Result<KvmHypervisor> {
601         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
602         let api_version = kvm_obj.get_api_version();
603 
604         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
605             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
606         }
607 
608         Ok(KvmHypervisor { kvm: kvm_obj })
609     }
610 }
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        // KVM_CREATE_VM may be interrupted by a signal; retry until we get a
        // definitive success or failure.
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        // Exactly one of the two cfg blocks below compiles in; it is the
        // function's tail expression for that architecture.
        #[cfg(target_arch = "x86_64")]
        {
            // Pre-build the MSR entry template for vCPUs: fetch the host's
            // supported MSR index list once and stash it on the VM so each
            // vCPU can clone it instead of querying KVM again.
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            // No MSR bookkeeping on ARM; just wrap the VM fd.
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    /// Verifies that the arch-specific required KVM extensions are present.
    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Kernel vCPU file descriptor.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR entry template cloned from the owning KvmVm at vCPU creation.
    msrs: MsrEntries,
    // Optional callback interface into the VMM; presumably used while
    // servicing vCPU exits — verify against the run() implementation.
    vmmops: Option<Arc<dyn vm::VmmOps>>,
    #[cfg(target_arch = "x86_64")]
    // Set once Hyper-V SynIC is enabled on this vCPU (enable_hyperv_synic());
    // influences which MSRs should later be saved.
    hyperv_synic: AtomicBool,
}
742 /// Implementation of Vcpu trait for KVM
743 /// Example:
744 /// #[cfg(feature = "kvm")]
745 /// extern crate hypervisor
746 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
747 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
748 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
749 /// let vcpu = vm.create_vcpu(0, None).unwrap();
750 /// vcpu.get/set().unwrap()
751 ///
752 impl cpu::Vcpu for KvmVcpu {
753     #[cfg(target_arch = "x86_64")]
754     ///
755     /// Returns the vCPU general purpose registers.
756     ///
757     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
758         self.fd
759             .get_regs()
760             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
761     }
762     #[cfg(target_arch = "x86_64")]
763     ///
764     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
765     ///
766     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
767         self.fd
768             .set_regs(regs)
769             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
770     }
771     #[cfg(target_arch = "x86_64")]
772     ///
773     /// Returns the vCPU special registers.
774     ///
775     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
776         self.fd
777             .get_sregs()
778             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
779     }
780     #[cfg(target_arch = "x86_64")]
781     ///
782     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
783     ///
784     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
785         self.fd
786             .set_sregs(sregs)
787             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
788     }
789     #[cfg(target_arch = "x86_64")]
790     ///
791     /// Returns the floating point state (FPU) from the vCPU.
792     ///
793     fn get_fpu(&self) -> cpu::Result<FpuState> {
794         self.fd
795             .get_fpu()
796             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
797     }
798     #[cfg(target_arch = "x86_64")]
799     ///
800     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
801     ///
802     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
803         self.fd
804             .set_fpu(fpu)
805             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
806     }
807     #[cfg(target_arch = "x86_64")]
808     ///
809     /// X86 specific call to setup the CPUID registers.
810     ///
811     fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
812         self.fd
813             .set_cpuid2(cpuid)
814             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
815     }
816     #[cfg(target_arch = "x86_64")]
817     ///
818     /// X86 specific call to enable HyperV SynIC
819     ///
820     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
821         // Update the information about Hyper-V SynIC being enabled and
822         // emulated as it will influence later which MSRs should be saved.
823         self.hyperv_synic.store(true, Ordering::Release);
824 
825         let cap = kvm_enable_cap {
826             cap: KVM_CAP_HYPERV_SYNIC,
827             ..Default::default()
828         };
829         self.fd
830             .enable_cap(&cap)
831             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
832     }
833     ///
834     /// X86 specific call to retrieve the CPUID registers.
835     ///
836     #[cfg(target_arch = "x86_64")]
837     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
838         self.fd
839             .get_cpuid2(num_entries)
840             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
841     }
842     #[cfg(target_arch = "x86_64")]
843     ///
844     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
845     ///
846     fn get_lapic(&self) -> cpu::Result<LapicState> {
847         self.fd
848             .get_lapic()
849             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
850     }
851     #[cfg(target_arch = "x86_64")]
852     ///
853     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
854     ///
855     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
856         self.fd
857             .set_lapic(klapic)
858             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
859     }
860     #[cfg(target_arch = "x86_64")]
861     ///
862     /// Returns the model-specific registers (MSR) for this vCPU.
863     ///
864     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
865         self.fd
866             .get_msrs(msrs)
867             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
868     }
869     #[cfg(target_arch = "x86_64")]
870     ///
871     /// Setup the model-specific registers (MSR) for this vCPU.
872     /// Returns the number of MSR entries actually written.
873     ///
874     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
875         self.fd
876             .set_msrs(msrs)
877             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
878     }
879     ///
880     /// Returns the vcpu's current "multiprocessing state".
881     ///
882     fn get_mp_state(&self) -> cpu::Result<MpState> {
883         self.fd
884             .get_mp_state()
885             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
886     }
887     ///
888     /// Sets the vcpu's current "multiprocessing state".
889     ///
890     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
891         self.fd
892             .set_mp_state(mp_state)
893             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
894     }
895     #[cfg(target_arch = "x86_64")]
896     ///
897     /// X86 specific call that returns the vcpu's current "xsave struct".
898     ///
899     fn get_xsave(&self) -> cpu::Result<Xsave> {
900         self.fd
901             .get_xsave()
902             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
903     }
904     #[cfg(target_arch = "x86_64")]
905     ///
906     /// X86 specific call that sets the vcpu's current "xsave struct".
907     ///
908     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
909         self.fd
910             .set_xsave(xsave)
911             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
912     }
913     #[cfg(target_arch = "x86_64")]
914     ///
915     /// X86 specific call that returns the vcpu's current "xcrs".
916     ///
917     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
918         self.fd
919             .get_xcrs()
920             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
921     }
922     #[cfg(target_arch = "x86_64")]
923     ///
924     /// X86 specific call that sets the vcpu's current "xcrs".
925     ///
926     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
927         self.fd
928             .set_xcrs(xcrs)
929             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
930     }
931     ///
932     /// Triggers the running of the current virtual CPU returning an exit reason.
933     ///
934     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
935         match self.fd.run() {
936             Ok(run) => match run {
937                 #[cfg(target_arch = "x86_64")]
938                 VcpuExit::IoIn(addr, data) => {
939                     if let Some(vmmops) = &self.vmmops {
940                         return vmmops
941                             .pio_read(addr.into(), data)
942                             .map(|_| cpu::VmExit::Ignore)
943                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
944                     }
945 
946                     Ok(cpu::VmExit::IoIn(addr, data))
947                 }
948                 #[cfg(target_arch = "x86_64")]
949                 VcpuExit::IoOut(addr, data) => {
950                     if let Some(vmmops) = &self.vmmops {
951                         return vmmops
952                             .pio_write(addr.into(), data)
953                             .map(|_| cpu::VmExit::Ignore)
954                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
955                     }
956 
957                     Ok(cpu::VmExit::IoOut(addr, data))
958                 }
959                 #[cfg(target_arch = "x86_64")]
960                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
961                 #[cfg(target_arch = "x86_64")]
962                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
963 
964                 #[cfg(target_arch = "aarch64")]
965                 VcpuExit::SystemEvent(event_type, flags) => {
966                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
967                     // On Aarch64, when the VM is shutdown, run() returns
968                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
969                     if event_type == KVM_SYSTEM_EVENT_RESET {
970                         Ok(cpu::VmExit::Reset)
971                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
972                         Ok(cpu::VmExit::Shutdown)
973                     } else {
974                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
975                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
976                             event_type,
977                             flags
978                         )))
979                     }
980                 }
981 
982                 VcpuExit::MmioRead(addr, data) => {
983                     if let Some(vmmops) = &self.vmmops {
984                         return vmmops
985                             .mmio_read(addr, data)
986                             .map(|_| cpu::VmExit::Ignore)
987                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
988                     }
989 
990                     Ok(cpu::VmExit::MmioRead(addr, data))
991                 }
992                 VcpuExit::MmioWrite(addr, data) => {
993                     if let Some(vmmops) = &self.vmmops {
994                         return vmmops
995                             .mmio_write(addr, data)
996                             .map(|_| cpu::VmExit::Ignore)
997                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
998                     }
999 
1000                     Ok(cpu::VmExit::MmioWrite(addr, data))
1001                 }
1002                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1003 
1004                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1005                     "Unexpected exit reason on vcpu run: {:?}",
1006                     r
1007                 ))),
1008             },
1009 
1010             Err(ref e) => match e.errno() {
1011                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1012                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1013                     "VCPU error {:?}",
1014                     e
1015                 ))),
1016             },
1017         }
1018     }
1019     #[cfg(target_arch = "x86_64")]
1020     ///
1021     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
1022     /// states of the vcpu.
1023     ///
1024     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
1025         self.fd
1026             .get_vcpu_events()
1027             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
1028     }
1029     #[cfg(target_arch = "x86_64")]
1030     ///
1031     /// Sets pending exceptions, interrupts, and NMIs as well as related states
1032     /// of the vcpu.
1033     ///
1034     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
1035         self.fd
1036             .set_vcpu_events(events)
1037             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
1038     }
1039     #[cfg(target_arch = "x86_64")]
1040     ///
1041     /// Let the guest know that it has been paused, which prevents from
1042     /// potential soft lockups when being resumed.
1043     ///
1044     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1045         self.fd
1046             .kvmclock_ctrl()
1047             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
1048     }
1049     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1050     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1051         self.fd
1052             .vcpu_init(kvi)
1053             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1054     }
1055     ///
1056     /// Sets the value of one register for this vCPU.
1057     ///
1058     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1059     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
1060         self.fd
1061             .set_one_reg(reg_id, data)
1062             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
1063     }
1064     ///
1065     /// Gets the value of one register for this vCPU.
1066     ///
1067     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1068     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
1069         self.fd
1070             .get_one_reg(reg_id)
1071             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
1072     }
1073     ///
1074     /// Gets a list of the guest registers that are supported for the
1075     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1076     ///
1077     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1078     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1079         self.fd
1080             .get_reg_list(reg_list)
1081             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1082     }
1083     ///
1084     /// Save the state of the core registers.
1085     ///
1086     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1087     fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
1088         let mut off = offset__of!(user_pt_regs, regs);
1089         // There are 31 user_pt_regs:
1090         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
1091         // These actually are the general-purpose registers of the Armv8-a
1092         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1093         for i in 0..31 {
1094             state.regs.regs[i] = self
1095                 .fd
1096                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1097                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1098             off += std::mem::size_of::<u64>();
1099         }
1100 
1101         // We are now entering the "Other register" section of the ARMv8-a architecture.
1102         // First one, stack pointer.
1103         let off = offset__of!(user_pt_regs, sp);
1104         state.regs.sp = self
1105             .fd
1106             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1107             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1108 
1109         // Second one, the program counter.
1110         let off = offset__of!(user_pt_regs, pc);
1111         state.regs.pc = self
1112             .fd
1113             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1114             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1115 
1116         // Next is the processor state.
1117         let off = offset__of!(user_pt_regs, pstate);
1118         state.regs.pstate = self
1119             .fd
1120             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1121             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1122 
1123         // The stack pointer associated with EL1
1124         let off = offset__of!(kvm_regs, sp_el1);
1125         state.sp_el1 = self
1126             .fd
1127             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1128             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1129 
1130         // Exception Link Register for EL1, when taking an exception to EL1, this register
1131         // holds the address to which to return afterwards.
1132         let off = offset__of!(kvm_regs, elr_el1);
1133         state.elr_el1 = self
1134             .fd
1135             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1136             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1137 
1138         // Saved Program Status Registers, there are 5 of them used in the kernel.
1139         let mut off = offset__of!(kvm_regs, spsr);
1140         for i in 0..KVM_NR_SPSR as usize {
1141             state.spsr[i] = self
1142                 .fd
1143                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1144                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1145             off += std::mem::size_of::<u64>();
1146         }
1147 
1148         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1149         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1150         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1151         for i in 0..32 {
1152             state.fp_regs.vregs[i] = self
1153                 .fd
1154                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1155                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1156                 .into();
1157             off += mem::size_of::<u128>();
1158         }
1159 
1160         // Floating-point Status Register
1161         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1162         state.fp_regs.fpsr = self
1163             .fd
1164             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1165             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1166             as u32;
1167 
1168         // Floating-point Control Register
1169         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1170         state.fp_regs.fpcr = self
1171             .fd
1172             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1173             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1174             as u32;
1175         Ok(())
1176     }
1177     ///
1178     /// Restore the state of the core registers.
1179     ///
1180     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1181     fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
1182         // The function follows the exact identical order from `state`. Look there
1183         // for some additional info on registers.
1184         let mut off = offset__of!(user_pt_regs, regs);
1185         for i in 0..31 {
1186             self.fd
1187                 .set_one_reg(
1188                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1189                     state.regs.regs[i],
1190                 )
1191                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1192             off += std::mem::size_of::<u64>();
1193         }
1194 
1195         let off = offset__of!(user_pt_regs, sp);
1196         self.fd
1197             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1198             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1199 
1200         let off = offset__of!(user_pt_regs, pc);
1201         self.fd
1202             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1203             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1204 
1205         let off = offset__of!(user_pt_regs, pstate);
1206         self.fd
1207             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1208             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1209 
1210         let off = offset__of!(kvm_regs, sp_el1);
1211         self.fd
1212             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1213             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1214 
1215         let off = offset__of!(kvm_regs, elr_el1);
1216         self.fd
1217             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1218             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1219 
1220         let mut off = offset__of!(kvm_regs, spsr);
1221         for i in 0..KVM_NR_SPSR as usize {
1222             self.fd
1223                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1224                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1225             off += std::mem::size_of::<u64>();
1226         }
1227 
1228         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1229         for i in 0..32 {
1230             self.fd
1231                 .set_one_reg(
1232                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1233                     state.fp_regs.vregs[i] as u64,
1234                 )
1235                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1236             off += mem::size_of::<u128>();
1237         }
1238 
1239         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1240         self.fd
1241             .set_one_reg(
1242                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1243                 state.fp_regs.fpsr as u64,
1244             )
1245             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1246 
1247         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1248         self.fd
1249             .set_one_reg(
1250                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1251                 state.fp_regs.fpcr as u64,
1252             )
1253             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1254         Ok(())
1255     }
1256     ///
1257     /// Save the state of the system registers.
1258     ///
1259     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1260     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1261         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1262         // around 500 registers.
1263         let mut reg_list = RegList::new(500).unwrap();
1264         self.fd
1265             .get_reg_list(&mut reg_list)
1266             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1267 
1268         // At this point reg_list should contain: core registers and system registers.
1269         // The register list contains the number of registers and their ids. We will be needing to
1270         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1271         // the core registers which are represented in the kernel by kvm_regs structure and for which
1272         // we can calculate the id based on the offset in the structure.
1273         reg_list.retain(|regid| is_system_register(*regid));
1274 
1275         // Now, for the rest of the registers left in the previously fetched register list, we are
1276         // simply calling KVM_GET_ONE_REG.
1277         let indices = reg_list.as_slice();
1278         for index in indices.iter() {
1279             state.push(kvm_bindings::kvm_one_reg {
1280                 id: *index,
1281                 addr: self
1282                     .fd
1283                     .get_one_reg(*index)
1284                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1285             });
1286         }
1287 
1288         Ok(())
1289     }
1290     ///
1291     /// Restore the state of the system registers.
1292     ///
1293     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1294     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1295         for reg in state {
1296             self.fd
1297                 .set_one_reg(reg.id, reg.addr)
1298                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1299         }
1300         Ok(())
1301     }
1302     ///
1303     /// Read the MPIDR - Multiprocessor Affinity Register.
1304     ///
1305     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1306     fn read_mpidr(&self) -> cpu::Result<u64> {
1307         self.fd
1308             .get_one_reg(MPIDR_EL1)
1309             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1310     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // Snapshot in the order dictated by the constraints documented above.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stops at the first unsupported MSR: `num_msrs` is the
            // index of the first faulty entry. Keep what was read so far,
            // then retry the remainder of the list past each faulty MSR.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Skip the faulty MSR and retry from the entry after it.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1442     ///
1443     /// Get the current AArch64 CPU state
1444     ///
1445     #[cfg(target_arch = "aarch64")]
1446     fn state(&self) -> cpu::Result<CpuState> {
1447         let mut state = CpuState {
1448             mp_state: self.get_mp_state()?,
1449             mpidr: self.read_mpidr()?,
1450             ..Default::default()
1451         };
1452         self.core_registers(&mut state.core_regs)?;
1453         self.system_registers(&mut state.sys_regs)?;
1454 
1455         Ok(state)
1456     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // Restore in the order dictated by the constraints documented above.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS stops at the first unsupported MSR: `num_msrs` is the
            // index of the first faulty entry.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                // Skip the faulty MSR and retry from the entry after it.
                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1541     ///
1542     /// Restore the previously saved AArch64 CPU state
1543     ///
1544     #[cfg(target_arch = "aarch64")]
1545     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1546         self.set_core_registers(&state.core_regs)?;
1547         self.set_system_registers(&state.sys_regs)?;
1548         self.set_mp_state(state.mp_state)?;
1549 
1550         Ok(())
1551     }
1552 
1553     ///
1554     /// Initialize TDX for this CPU
1555     ///
1556     #[cfg(feature = "tdx")]
1557     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1558         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1559             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1560     }
1561 }
1562 
/// Device struct for KVM
pub struct KvmDevice {
    // Owned device fd; used for the get/set attribute ioctls below.
    fd: DeviceFd,
}
1567 
1568 impl device::Device for KvmDevice {
1569     ///
1570     /// Set device attribute
1571     ///
1572     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1573         self.fd
1574             .set_device_attr(attr)
1575             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1576     }
1577     ///
1578     /// Get device attribute
1579     ///
1580     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1581         self.fd
1582             .get_device_attr(attr)
1583             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1584     }
1585 }
1586 
impl AsRawFd for KvmDevice {
    /// Expose the raw file descriptor of the underlying KVM device fd.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
1592