// xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 9af2968a7dc47b89bf07ea9dc5e735084efcfa3a)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 #[cfg(target_arch = "aarch64")]
26 use std::convert::TryInto;
27 #[cfg(target_arch = "x86_64")]
28 use std::fs::File;
29 use std::os::unix::io::{AsRawFd, RawFd};
30 use std::result;
31 #[cfg(target_arch = "x86_64")]
32 use std::sync::atomic::{AtomicBool, Ordering};
33 use std::sync::Arc;
34 #[cfg(target_arch = "x86_64")]
35 use vm_memory::Address;
36 use vmm_sys_util::eventfd::EventFd;
37 // x86_64 dependencies
38 #[cfg(target_arch = "x86_64")]
39 pub mod x86_64;
40 #[cfg(target_arch = "x86_64")]
41 use crate::arch::x86::NUM_IOAPIC_PINS;
42 #[cfg(target_arch = "aarch64")]
43 use aarch64::{RegList, Register, StandardRegisters};
44 #[cfg(target_arch = "x86_64")]
45 use kvm_bindings::{
46     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
47 };
48 #[cfg(target_arch = "x86_64")]
49 use x86_64::{
50     check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
51 };
52 #[cfg(target_arch = "x86_64")]
53 pub use x86_64::{
54     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
55     Xsave, CPUID_FLAG_VALID_INDEX,
56 };
57 // aarch64 dependencies
58 #[cfg(target_arch = "aarch64")]
59 pub mod aarch64;
60 pub use kvm_bindings;
61 #[cfg(feature = "tdx")]
62 use kvm_bindings::KVMIO;
63 pub use kvm_bindings::{
64     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
65     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
66     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
67 };
68 #[cfg(target_arch = "aarch64")]
69 use kvm_bindings::{
70     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
71     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
72 };
73 pub use kvm_ioctls;
74 pub use kvm_ioctls::{Cap, Kvm};
75 #[cfg(target_arch = "aarch64")]
76 use std::mem;
77 use thiserror::Error;
78 #[cfg(feature = "tdx")]
79 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
80 ///
81 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
82 ///
83 pub use {
84     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
85     kvm_bindings::kvm_device_attr as DeviceAttr,
86     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
87     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
88     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
89     kvm_ioctls::VcpuExit,
90 };
91 
#[cfg(target_arch = "x86_64")]
// Capability number for KVM_CAP_SGX_ATTRIBUTE, defined locally because it is
// not exported by the kvm-bindings version in use. Value presumably mirrors
// the Linux UAPI headers — verify when bumping kernel/bindings versions.
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;
94 
#[cfg(feature = "tdx")]
// Declares the ioctl number for KVM_MEMORY_ENCRYPT_OP (_IOWR(KVMIO, 0xba, u64)),
// the single entry point used by tdx_command() to issue TDX sub-commands.
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
97 
#[cfg(feature = "tdx")]
#[repr(u32)]
// Sub-commands carried by the KVM_MEMORY_ENCRYPT_OP ioctl to drive the TDX VM
// lifecycle. Discriminants are sequential u32 values starting at 0 and must
// match what the TDX-enabled kernel expects.
enum TdxCommand {
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
108 
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
// VM-scoped state for KVM. Intentionally empty: as visible in state()/set_state()
// below, KVM currently has no VM-level data to snapshot through this wrapper.
pub struct KvmVmState {}

// Generically-named alias so arch/hypervisor-neutral code can refer to `VmState`.
pub use KvmVmState as VmState;
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // VM file descriptor, shared with every vCPU created from this VM.
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    // Template list of MSR entries (indices pre-filled, values zeroed) that is
    // cloned into each new vCPU — see create_vcpu().
    msrs: MsrEntries,
    // Snapshot/restore placeholder; currently empty (see KvmVmState).
    state: KvmVmState,
}
120 
///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    /// The optional `vmmops` callbacks let the vCPU service PIO/MMIO exits
    /// in-process instead of bubbling them up to the caller.
    fn create_vcpu(
        &self,
        id: u8,
        vmmops: Option<Arc<Box<dyn VmmOps>>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            // The vCPU inherits the VM's pre-computed MSR index list.
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vmmops,
            // SynIC starts disabled; flipped by enable_hyperv_synic().
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    /// `datamatch == None` means the eventfd fires on any write to `addr`.
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }
    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        // kvm_irq_routing ends in a flexible array member; allocate enough
        // trailing storage for `entries.len()` routing entries.
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;

        // SAFETY: the vector above was sized for exactly `entries.len()`
        // trailing kvm_irq_routing_entry elements, so the mutable slice is
        // in bounds and the copy cannot overflow.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> MemoryRegion {
        MemoryRegion {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            // Flags are the bitwise OR of the two optional KVM memory flags.
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(user_memory_region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
        let mut region = user_memory_region;

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
        let fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        // Shadowing: the input descriptor is consumed into the device wrapper.
        let device = KvmDevice { fd };
        Ok(Arc::new(device))
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    // Enable the split irqchip: TSS set first, then KVM_CAP_SPLIT_IRQCHIP with
    // the IOAPIC pin count, so only the local APIC stays emulated in kernel.
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Set TSS
        self.fd
            .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    // Grant this VM permission to set SGX attributes by passing the
    // provisioning file descriptor through KVM_CAP_SGX_ATTRIBUTE.
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        self.fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        self.fd
            .set_clock(data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Get the Vm state. Return VM specific data
    ///
    fn state(&self) -> vm::Result<VmState> {
        Ok(self.state)
    }
    ///
    /// Set the VM state
    ///
    // No-op today: KvmVmState is empty, so there is nothing to restore.
    fn set_state(&self, _state: VmState) -> vm::Result<()> {
        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
        // In-memory layout handed to the kernel; must stay #[repr(C)].
        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            reserved: u32,
            attributes: u64,
            cpuid: u64,
        }
        let data = TdxInitVm {
            max_vcpus,
            reserved: 0,
            attributes: 1, // TDX1_TD_ATTRIBUTE_DEBUG,
            // The kernel reads the CPUID entries through this raw pointer;
            // `cpuid` outlives the ioctl below.
            cpuid: cpuid.as_fam_struct_ptr() as u64,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        // In-memory layout handed to the kernel; must stay #[repr(C)].
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            // Region size is expressed in 4 KiB pages.
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            // metadata flag 1 => extend the TD measurement with this region.
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
}
464 
#[cfg(feature = "tdx")]
/// Issue a single TDX sub-command to the kernel through the
/// `KVM_MEMORY_ENCRYPT_OP` ioctl on the given KVM file descriptor.
///
/// `metadata` and `data` are command-specific (typically a flag word and a
/// guest/host pointer cast to u64). Returns the OS error on ioctl failure.
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Wire format of the ioctl payload; layout must match the kernel's.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }

    let request = TdxIoctlCmd {
        command,
        metadata,
        data,
    };

    // SAFETY: `request` lives for the duration of the call and its
    // #[repr(C)] layout matches what KVM_MEMORY_ENCRYPT_OP expects.
    let rc = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &request as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    match rc {
        r if r < 0 => Err(std::io::Error::last_os_error()),
        _ => Ok(()),
    }
}
496 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to /dev/kvm, used for system-scoped ioctls and VM creation.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    // A KVM extension required by the VMM is not exposed by the host kernel.
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
// Shorthand result type for KVM-specific operations.
pub type KvmResult<T> = result::Result<T, KvmError>;
508 impl KvmHypervisor {
509     /// Create a hypervisor based on Kvm
510     pub fn new() -> hypervisor::Result<KvmHypervisor> {
511         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
512         let api_version = kvm_obj.get_api_version();
513 
514         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
515             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
516         }
517 
518         Ok(KvmHypervisor { kvm: kvm_obj })
519     }
520 }
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        // Retry loop: KVM_CREATE_VM may be interrupted by a signal.
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            // Pre-compute the host-supported MSR index list once per VM so
            // that every vCPU can clone it (see KvmVm::create_vcpu).
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs = MsrEntries::new(num_msrs).unwrap();
            let indices = msr_list.as_slice();
            let msr_entries = msrs.as_mut_slice();
            // Only the index field is filled in; values are fetched later.
            for (pos, index) in indices.iter().enumerate() {
                msr_entries[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                state: VmState {},
            }))
        }

        #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                state: VmState {},
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap()
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    // Verify that the KVM extensions this VMM depends on are all present
    // (the required set is arch-specific).
    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
        self.kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by KVM.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Per-vCPU KVM file descriptor.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR index list inherited from the VM at creation time.
    msrs: MsrEntries,
    // Optional VMM callbacks used to service PIO/MMIO exits in-process.
    vmmops: Option<Arc<Box<dyn vm::VmmOps>>>,
    #[cfg(target_arch = "x86_64")]
    // Whether Hyper-V SynIC has been enabled on this vCPU; set by
    // enable_hyperv_synic() and consulted when deciding which MSRs to save.
    hyperv_synic: AtomicBool,
}
650 /// Implementation of Vcpu trait for KVM
651 /// Example:
652 /// #[cfg(feature = "kvm")]
653 /// extern crate hypervisor
654 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
655 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
656 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
657 /// let vcpu = vm.create_vcpu(0, None).unwrap();
658 /// vcpu.get/set().unwrap()
659 ///
660 impl cpu::Vcpu for KvmVcpu {
661     #[cfg(target_arch = "x86_64")]
662     ///
663     /// Returns the vCPU general purpose registers.
664     ///
665     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
666         self.fd
667             .get_regs()
668             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
669     }
670     #[cfg(target_arch = "x86_64")]
671     ///
672     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
673     ///
674     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
675         self.fd
676             .set_regs(regs)
677             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
678     }
679     #[cfg(target_arch = "x86_64")]
680     ///
681     /// Returns the vCPU special registers.
682     ///
683     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
684         self.fd
685             .get_sregs()
686             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
687     }
688     #[cfg(target_arch = "x86_64")]
689     ///
690     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
691     ///
692     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
693         self.fd
694             .set_sregs(sregs)
695             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
696     }
697     #[cfg(target_arch = "x86_64")]
698     ///
699     /// Returns the floating point state (FPU) from the vCPU.
700     ///
701     fn get_fpu(&self) -> cpu::Result<FpuState> {
702         self.fd
703             .get_fpu()
704             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
705     }
706     #[cfg(target_arch = "x86_64")]
707     ///
708     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
709     ///
710     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
711         self.fd
712             .set_fpu(fpu)
713             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
714     }
715     #[cfg(target_arch = "x86_64")]
716     ///
717     /// X86 specific call to setup the CPUID registers.
718     ///
719     fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
720         self.fd
721             .set_cpuid2(cpuid)
722             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
723     }
724     #[cfg(target_arch = "x86_64")]
725     ///
726     /// X86 specific call to enable HyperV SynIC
727     ///
728     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
729         // Update the information about Hyper-V SynIC being enabled and
730         // emulated as it will influence later which MSRs should be saved.
731         self.hyperv_synic.store(true, Ordering::Release);
732 
733         let cap = kvm_enable_cap {
734             cap: KVM_CAP_HYPERV_SYNIC,
735             ..Default::default()
736         };
737         self.fd
738             .enable_cap(&cap)
739             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
740     }
741     ///
742     /// X86 specific call to retrieve the CPUID registers.
743     ///
744     #[cfg(target_arch = "x86_64")]
745     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
746         self.fd
747             .get_cpuid2(num_entries)
748             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
749     }
750     #[cfg(target_arch = "x86_64")]
751     ///
752     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
753     ///
754     fn get_lapic(&self) -> cpu::Result<LapicState> {
755         self.fd
756             .get_lapic()
757             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
758     }
759     #[cfg(target_arch = "x86_64")]
760     ///
761     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
762     ///
763     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
764         self.fd
765             .set_lapic(klapic)
766             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
767     }
768     #[cfg(target_arch = "x86_64")]
769     ///
770     /// Returns the model-specific registers (MSR) for this vCPU.
771     ///
772     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
773         self.fd
774             .get_msrs(msrs)
775             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
776     }
777     #[cfg(target_arch = "x86_64")]
778     ///
779     /// Setup the model-specific registers (MSR) for this vCPU.
780     /// Returns the number of MSR entries actually written.
781     ///
782     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
783         self.fd
784             .set_msrs(msrs)
785             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
786     }
787     ///
788     /// Returns the vcpu's current "multiprocessing state".
789     ///
790     fn get_mp_state(&self) -> cpu::Result<MpState> {
791         self.fd
792             .get_mp_state()
793             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
794     }
795     ///
796     /// Sets the vcpu's current "multiprocessing state".
797     ///
798     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
799         self.fd
800             .set_mp_state(mp_state)
801             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
802     }
803     #[cfg(target_arch = "x86_64")]
804     ///
805     /// X86 specific call that returns the vcpu's current "xsave struct".
806     ///
807     fn get_xsave(&self) -> cpu::Result<Xsave> {
808         self.fd
809             .get_xsave()
810             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
811     }
812     #[cfg(target_arch = "x86_64")]
813     ///
814     /// X86 specific call that sets the vcpu's current "xsave struct".
815     ///
816     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
817         self.fd
818             .set_xsave(xsave)
819             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
820     }
821     #[cfg(target_arch = "x86_64")]
822     ///
823     /// X86 specific call that returns the vcpu's current "xcrs".
824     ///
825     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
826         self.fd
827             .get_xcrs()
828             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
829     }
830     #[cfg(target_arch = "x86_64")]
831     ///
832     /// X86 specific call that sets the vcpu's current "xcrs".
833     ///
834     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
835         self.fd
836             .set_xcrs(xcrs)
837             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
838     }
839     ///
840     /// Triggers the running of the current virtual CPU returning an exit reason.
841     ///
842     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
843         match self.fd.run() {
844             Ok(run) => match run {
845                 #[cfg(target_arch = "x86_64")]
846                 VcpuExit::IoIn(addr, data) => {
847                     if let Some(vmmops) = &self.vmmops {
848                         return vmmops
849                             .pio_read(addr.into(), data)
850                             .map(|_| cpu::VmExit::Ignore)
851                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
852                     }
853 
854                     Ok(cpu::VmExit::IoIn(addr, data))
855                 }
856                 #[cfg(target_arch = "x86_64")]
857                 VcpuExit::IoOut(addr, data) => {
858                     if let Some(vmmops) = &self.vmmops {
859                         return vmmops
860                             .pio_write(addr.into(), data)
861                             .map(|_| cpu::VmExit::Ignore)
862                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
863                     }
864 
865                     Ok(cpu::VmExit::IoOut(addr, data))
866                 }
867                 #[cfg(target_arch = "x86_64")]
868                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
869                 #[cfg(target_arch = "x86_64")]
870                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
871 
872                 #[cfg(target_arch = "aarch64")]
873                 VcpuExit::SystemEvent(event_type, flags) => {
874                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
875                     // On Aarch64, when the VM is shutdown, run() returns
876                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
877                     if event_type == KVM_SYSTEM_EVENT_RESET {
878                         Ok(cpu::VmExit::Reset)
879                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
880                         Ok(cpu::VmExit::Shutdown)
881                     } else {
882                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
883                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
884                             event_type,
885                             flags
886                         )))
887                     }
888                 }
889 
890                 VcpuExit::MmioRead(addr, data) => {
891                     if let Some(vmmops) = &self.vmmops {
892                         return vmmops
893                             .mmio_read(addr, data)
894                             .map(|_| cpu::VmExit::Ignore)
895                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
896                     }
897 
898                     Ok(cpu::VmExit::MmioRead(addr, data))
899                 }
900                 VcpuExit::MmioWrite(addr, data) => {
901                     if let Some(vmmops) = &self.vmmops {
902                         return vmmops
903                             .mmio_write(addr, data)
904                             .map(|_| cpu::VmExit::Ignore)
905                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
906                     }
907 
908                     Ok(cpu::VmExit::MmioWrite(addr, data))
909                 }
910                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
911 
912                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
913                     "Unexpected exit reason on vcpu run: {:?}",
914                     r
915                 ))),
916             },
917 
918             Err(ref e) => match e.errno() {
919                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
920                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
921                     "VCPU error {:?}",
922                     e
923                 ))),
924             },
925         }
926     }
927     #[cfg(target_arch = "x86_64")]
928     ///
929     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
930     /// states of the vcpu.
931     ///
932     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
933         self.fd
934             .get_vcpu_events()
935             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
936     }
937     #[cfg(target_arch = "x86_64")]
938     ///
939     /// Sets pending exceptions, interrupts, and NMIs as well as related states
940     /// of the vcpu.
941     ///
942     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
943         self.fd
944             .set_vcpu_events(events)
945             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
946     }
947     #[cfg(target_arch = "x86_64")]
948     ///
949     /// Let the guest know that it has been paused, which prevents from
950     /// potential soft lockups when being resumed.
951     ///
952     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
953         self.fd
954             .kvmclock_ctrl()
955             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
956     }
957     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
958     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
959         self.fd
960             .vcpu_init(kvi)
961             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
962     }
963     ///
964     /// Sets the value of one register for this vCPU.
965     ///
966     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
967     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
968         self.fd
969             .set_one_reg(reg_id, data)
970             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
971     }
972     ///
973     /// Gets the value of one register for this vCPU.
974     ///
975     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
976     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
977         self.fd
978             .get_one_reg(reg_id)
979             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
980     }
981     ///
982     /// Gets a list of the guest registers that are supported for the
983     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
984     ///
985     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
986     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
987         self.fd
988             .get_reg_list(reg_list)
989             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
990     }
991     ///
992     /// Save the state of the core registers.
993     ///
994     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
995     fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
996         let mut off = offset__of!(user_pt_regs, regs);
997         // There are 31 user_pt_regs:
998         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
999         // These actually are the general-purpose registers of the Armv8-a
1000         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
1001         for i in 0..31 {
1002             state.regs.regs[i] = self
1003                 .fd
1004                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1005                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1006             off += std::mem::size_of::<u64>();
1007         }
1008 
1009         // We are now entering the "Other register" section of the ARMv8-a architecture.
1010         // First one, stack pointer.
1011         let off = offset__of!(user_pt_regs, sp);
1012         state.regs.sp = self
1013             .fd
1014             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1015             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1016 
1017         // Second one, the program counter.
1018         let off = offset__of!(user_pt_regs, pc);
1019         state.regs.pc = self
1020             .fd
1021             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1022             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1023 
1024         // Next is the processor state.
1025         let off = offset__of!(user_pt_regs, pstate);
1026         state.regs.pstate = self
1027             .fd
1028             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1029             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1030 
1031         // The stack pointer associated with EL1
1032         let off = offset__of!(kvm_regs, sp_el1);
1033         state.sp_el1 = self
1034             .fd
1035             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1036             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1037 
1038         // Exception Link Register for EL1, when taking an exception to EL1, this register
1039         // holds the address to which to return afterwards.
1040         let off = offset__of!(kvm_regs, elr_el1);
1041         state.elr_el1 = self
1042             .fd
1043             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1044             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1045 
1046         // Saved Program Status Registers, there are 5 of them used in the kernel.
1047         let mut off = offset__of!(kvm_regs, spsr);
1048         for i in 0..KVM_NR_SPSR as usize {
1049             state.spsr[i] = self
1050                 .fd
1051                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1052                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1053             off += std::mem::size_of::<u64>();
1054         }
1055 
1056         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1057         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1058         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1059         for i in 0..32 {
1060             state.fp_regs.vregs[i][0] = self
1061                 .fd
1062                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1063                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1064             off += mem::size_of::<u128>();
1065         }
1066 
1067         // Floating-point Status Register
1068         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1069         state.fp_regs.fpsr = self
1070             .fd
1071             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1072             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1073             as u32;
1074 
1075         // Floating-point Control Register
1076         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1077         state.fp_regs.fpcr = self
1078             .fd
1079             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1080             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1081             as u32;
1082         Ok(())
1083     }
1084     ///
1085     /// Restore the state of the core registers.
1086     ///
1087     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1088     fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
1089         // The function follows the exact identical order from `state`. Look there
1090         // for some additional info on registers.
1091         let mut off = offset__of!(user_pt_regs, regs);
1092         for i in 0..31 {
1093             self.fd
1094                 .set_one_reg(
1095                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1096                     state.regs.regs[i],
1097                 )
1098                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1099             off += std::mem::size_of::<u64>();
1100         }
1101 
1102         let off = offset__of!(user_pt_regs, sp);
1103         self.fd
1104             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1105             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1106 
1107         let off = offset__of!(user_pt_regs, pc);
1108         self.fd
1109             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1110             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1111 
1112         let off = offset__of!(user_pt_regs, pstate);
1113         self.fd
1114             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1115             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1116 
1117         let off = offset__of!(kvm_regs, sp_el1);
1118         self.fd
1119             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1120             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1121 
1122         let off = offset__of!(kvm_regs, elr_el1);
1123         self.fd
1124             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1125             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1126 
1127         let mut off = offset__of!(kvm_regs, spsr);
1128         for i in 0..KVM_NR_SPSR as usize {
1129             self.fd
1130                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1131                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1132             off += std::mem::size_of::<u64>();
1133         }
1134 
1135         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1136         for i in 0..32 {
1137             self.fd
1138                 .set_one_reg(
1139                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1140                     state.fp_regs.vregs[i][0],
1141                 )
1142                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1143             off += mem::size_of::<u128>();
1144         }
1145 
1146         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1147         self.fd
1148             .set_one_reg(
1149                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1150                 state.fp_regs.fpsr as u64,
1151             )
1152             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1153 
1154         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1155         self.fd
1156             .set_one_reg(
1157                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1158                 state.fp_regs.fpcr as u64,
1159             )
1160             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1161         Ok(())
1162     }
1163     ///
1164     /// Save the state of the system registers.
1165     ///
1166     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1167     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1168         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1169         // around 500 registers.
1170         let mut reg_list = RegList::new(500).unwrap();
1171         self.fd
1172             .get_reg_list(&mut reg_list)
1173             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1174 
1175         // At this point reg_list should contain: core registers and system registers.
1176         // The register list contains the number of registers and their ids. We will be needing to
1177         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1178         // the core registers which are represented in the kernel by kvm_regs structure and for which
1179         // we can calculate the id based on the offset in the structure.
1180 
1181         reg_list.retain(|regid| *regid != 0);
1182         reg_list.as_slice().to_vec().sort_unstable();
1183 
1184         reg_list.retain(|regid| is_system_register(*regid));
1185 
1186         // Now, for the rest of the registers left in the previously fetched register list, we are
1187         // simply calling KVM_GET_ONE_REG.
1188         let indices = reg_list.as_slice();
1189         for (_pos, index) in indices.iter().enumerate() {
1190             if _pos > 230 {
1191                 break;
1192             }
1193             state.push(kvm_bindings::kvm_one_reg {
1194                 id: *index,
1195                 addr: self
1196                     .fd
1197                     .get_one_reg(*index)
1198                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1199             });
1200         }
1201 
1202         Ok(())
1203     }
1204     ///
1205     /// Restore the state of the system registers.
1206     ///
1207     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1208     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1209         for reg in state {
1210             self.fd
1211                 .set_one_reg(reg.id, reg.addr)
1212                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1213         }
1214         Ok(())
1215     }
1216     ///
1217     /// Read the MPIDR - Multiprocessor Affinity Register.
1218     ///
1219     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1220     fn read_mpidr(&self) -> cpu::Result<u64> {
1221         self.fd
1222             .get_one_reg(MPIDR_EL1)
1223             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1224     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // See the ordering requirements in the doc comment above: MP state
        // must be read first, vCPU events last.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stops at the first unsupported MSR, returning how many
            // entries it filled. Skip the faulty entry and retry on the rest
            // of the list until every remaining MSR has been attempted.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Retry from the entry right after the faulty one.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        // Per the ordering notes above, vCPU events are read last.
        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1356     ///
1357     /// Get the current AArch64 CPU state
1358     ///
1359     #[cfg(target_arch = "aarch64")]
1360     fn state(&self) -> cpu::Result<CpuState> {
1361         let mut state = CpuState {
1362             mp_state: self.get_mp_state()?,
1363             mpidr: self.read_mpidr()?,
1364             ..Default::default()
1365         };
1366         self.core_registers(&mut state.core_regs)?;
1367         self.system_registers(&mut state.sys_regs)?;
1368 
1369         Ok(state)
1370     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // The call order below is deliberate; see the ordering requirements
        // in the doc comment above before changing it.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS stops at the first unsupported MSR, returning how many
            // entries it consumed. Skip the faulty entry and retry on the
            // remainder of the list until all MSRs have been attempted.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == expected_num_msrs {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        // Per the ordering notes above, vCPU events are restored last.
        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1455     ///
1456     /// Restore the previously saved AArch64 CPU state
1457     ///
1458     #[cfg(target_arch = "aarch64")]
1459     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1460         self.set_core_registers(&state.core_regs)?;
1461         self.set_system_registers(&state.sys_regs)?;
1462         self.set_mp_state(state.mp_state)?;
1463 
1464         Ok(())
1465     }
1466 
1467     ///
1468     /// Initialize TDX for this CPU
1469     ///
1470     #[cfg(feature = "tdx")]
1471     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1472         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1473             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1474     }
1475 }
1476 
/// Device struct for KVM
pub struct KvmDevice {
    // Underlying KVM device file descriptor; all attribute get/set ioctls
    // below are issued through it.
    fd: DeviceFd,
}
1481 
1482 impl device::Device for KvmDevice {
1483     ///
1484     /// Set device attribute
1485     ///
1486     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1487         self.fd
1488             .set_device_attr(attr)
1489             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1490     }
1491     ///
1492     /// Get device attribute
1493     ///
1494     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1495         self.fd
1496             .get_device_attr(attr)
1497             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1498     }
1499 }
1500 
impl AsRawFd for KvmDevice {
    /// Expose the raw file descriptor of the underlying KVM device.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
1506