xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 496ceed1d02b5884e2a4c570ef231d2c90b64fc0)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{
13     check_required_kvm_extensions, is_system_register, VcpuInit, VcpuKvmState as CpuState,
14     MPIDR_EL1,
15 };
16 use crate::cpu;
17 use crate::device;
18 use crate::hypervisor;
19 use crate::vec_with_array_field;
20 use crate::vm::{self, VmmOps};
21 #[cfg(target_arch = "aarch64")]
22 use crate::{arm64_core_reg_id, offset__of};
23 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
24 use serde_derive::{Deserialize, Serialize};
25 use std::os::unix::io::{AsRawFd, RawFd};
26 use std::result;
27 #[cfg(target_arch = "x86_64")]
28 use std::sync::atomic::{AtomicBool, Ordering};
29 use std::sync::Arc;
30 #[cfg(target_arch = "x86_64")]
31 use vm_memory::Address;
32 use vmm_sys_util::eventfd::EventFd;
33 // x86_64 dependencies
34 #[cfg(target_arch = "x86_64")]
35 pub mod x86_64;
36 #[cfg(target_arch = "x86_64")]
37 use crate::arch::x86::NUM_IOAPIC_PINS;
38 #[cfg(target_arch = "aarch64")]
39 use aarch64::{RegList, Register, StandardRegisters};
40 #[cfg(target_arch = "x86_64")]
41 use kvm_bindings::{
42     kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
43 };
44 #[cfg(target_arch = "x86_64")]
45 use x86_64::{
46     check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
47 };
48 #[cfg(target_arch = "x86_64")]
49 pub use x86_64::{
50     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
51     Xsave, CPUID_FLAG_VALID_INDEX,
52 };
53 // aarch64 dependencies
54 #[cfg(target_arch = "aarch64")]
55 pub mod aarch64;
56 pub use kvm_bindings;
57 #[cfg(feature = "tdx")]
58 use kvm_bindings::KVMIO;
59 pub use kvm_bindings::{
60     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
61     kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
62     KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
63 };
64 #[cfg(target_arch = "aarch64")]
65 use kvm_bindings::{
66     kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
67     KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
68 };
69 pub use kvm_ioctls;
70 pub use kvm_ioctls::{Cap, Kvm};
71 #[cfg(target_arch = "aarch64")]
72 use std::mem;
73 #[cfg(feature = "tdx")]
74 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_expr, ioctl_ioc_nr, ioctl_iowr_nr};
75 
76 ///
77 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
78 ///
79 pub use {
80     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
81     kvm_bindings::kvm_device_attr as DeviceAttr,
82     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
83     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
84     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
85     kvm_ioctls::VcpuExit,
86 };
87 
// Generate the ioctl number for KVM_MEMORY_ENCRYPT_OP
// (_IOWR, type KVMIO, nr 0xba, pointer-sized argument), used below by
// tdx_command() to issue TDX commands to the kernel.
#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);
90 
/// Sub-commands passed to the KVM_MEMORY_ENCRYPT_OP ioctl for TDX.
/// Discriminants start at 0 and increase in declaration order; they must
/// match the kernel's TDX command numbering — do not reorder.
#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    // Not issued anywhere in this file yet, hence the allow.
    #[allow(dead_code)]
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}
101 
/// KVM-specific VM state snapshot returned by `Vm::state()`.
/// Currently empty: `set_state()` is a no-op and no VM-level data is
/// saved or restored through it.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}

// Generic alias so arch-independent code can refer to `VmState`.
pub use KvmVmState as VmState;
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the kernel VM file descriptor; cloned into vCPUs
    // and devices created from this VM.
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    // MSR entry list (indices pre-filled at VM creation) cloned into each
    // vCPU by create_vcpu().
    msrs: MsrEntries,
    // State handed back by state(); see KvmVmState.
    state: KvmVmState,
}
113 
114 ///
115 /// Implementation of Vm trait for KVM
116 /// Example:
117 /// #[cfg(feature = "kvm")]
118 /// extern crate hypervisor
119 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
120 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
121 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
122 /// vm.set/get().unwrap()
123 ///
124 impl vm::Vm for KvmVm {
125     #[cfg(target_arch = "x86_64")]
126     ///
127     /// Sets the address of the three-page region in the VM's address space.
128     ///
129     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
130         self.fd
131             .set_tss_address(offset)
132             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
133     }
134     ///
135     /// Creates an in-kernel interrupt controller.
136     ///
137     fn create_irq_chip(&self) -> vm::Result<()> {
138         self.fd
139             .create_irq_chip()
140             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
141     }
142     ///
143     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
144     ///
145     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
146         self.fd
147             .register_irqfd(fd, gsi)
148             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
149     }
150     ///
151     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
152     ///
153     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
154         self.fd
155             .unregister_irqfd(fd, gsi)
156             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
157     }
158     ///
159     /// Creates a VcpuFd object from a vcpu RawFd.
160     ///
161     fn create_vcpu(
162         &self,
163         id: u8,
164         vmmops: Option<Arc<Box<dyn VmmOps>>>,
165     ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
166         let vc = self
167             .fd
168             .create_vcpu(id as u64)
169             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
170         let vcpu = KvmVcpu {
171             fd: vc,
172             #[cfg(target_arch = "x86_64")]
173             msrs: self.msrs.clone(),
174             vmmops,
175             #[cfg(target_arch = "x86_64")]
176             hyperv_synic: AtomicBool::new(false),
177         };
178         Ok(Arc::new(vcpu))
179     }
180     ///
181     /// Registers an event to be signaled whenever a certain address is written to.
182     ///
183     fn register_ioevent(
184         &self,
185         fd: &EventFd,
186         addr: &IoEventAddress,
187         datamatch: Option<vm::DataMatch>,
188     ) -> vm::Result<()> {
189         if let Some(dm) = datamatch {
190             match dm {
191                 vm::DataMatch::DataMatch32(kvm_dm32) => self
192                     .fd
193                     .register_ioevent(fd, addr, kvm_dm32)
194                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
195                 vm::DataMatch::DataMatch64(kvm_dm64) => self
196                     .fd
197                     .register_ioevent(fd, addr, kvm_dm64)
198                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
199             }
200         } else {
201             self.fd
202                 .register_ioevent(fd, addr, NoDatamatch)
203                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
204         }
205     }
206     ///
207     /// Unregisters an event from a certain address it has been previously registered to.
208     ///
209     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
210         self.fd
211             .unregister_ioevent(fd, addr, NoDatamatch)
212             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
213     }
214     ///
215     /// Sets the GSI routing table entries, overwriting any previously set
216     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
217     ///
218     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
219         let mut irq_routing =
220             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
221         irq_routing[0].nr = entries.len() as u32;
222         irq_routing[0].flags = 0;
223 
224         unsafe {
225             let entries_slice: &mut [kvm_irq_routing_entry] =
226                 irq_routing[0].entries.as_mut_slice(entries.len());
227             entries_slice.copy_from_slice(&entries);
228         }
229 
230         self.fd
231             .set_gsi_routing(&irq_routing[0])
232             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
233     }
234     ///
235     /// Creates a memory region structure that can be used with set_user_memory_region
236     ///
237     fn make_user_memory_region(
238         &self,
239         slot: u32,
240         guest_phys_addr: u64,
241         memory_size: u64,
242         userspace_addr: u64,
243         readonly: bool,
244         log_dirty_pages: bool,
245     ) -> MemoryRegion {
246         MemoryRegion {
247             slot,
248             guest_phys_addr,
249             memory_size,
250             userspace_addr,
251             flags: if readonly { KVM_MEM_READONLY } else { 0 }
252                 | if log_dirty_pages {
253                     KVM_MEM_LOG_DIRTY_PAGES
254                 } else {
255                     0
256                 },
257         }
258     }
259     ///
260     /// Creates/modifies a guest physical memory slot.
261     ///
262     fn set_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
263         // Safe because guest regions are guaranteed not to overlap.
264         unsafe {
265             self.fd
266                 .set_user_memory_region(user_memory_region)
267                 .map_err(|e| vm::HypervisorVmError::SetUserMemory(e.into()))
268         }
269     }
270     ///
271     /// Creates an emulated device in the kernel.
272     ///
273     /// See the documentation for `KVM_CREATE_DEVICE`.
274     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
275         let fd = self
276             .fd
277             .create_device(device)
278             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
279         let device = KvmDevice { fd };
280         Ok(Arc::new(device))
281     }
282     ///
283     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
284     ///
285     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
286     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
287         self.fd
288             .get_preferred_target(kvi)
289             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
290     }
291     #[cfg(target_arch = "x86_64")]
292     fn enable_split_irq(&self) -> vm::Result<()> {
293         // Set TSS
294         self.fd
295             .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
296             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
297         // Create split irqchip
298         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
299         // are not.
300         let mut cap = kvm_enable_cap {
301             cap: KVM_CAP_SPLIT_IRQCHIP,
302             ..Default::default()
303         };
304         cap.args[0] = NUM_IOAPIC_PINS as u64;
305         self.fd
306             .enable_cap(&cap)
307             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
308         Ok(())
309     }
310     /// Retrieve guest clock.
311     #[cfg(target_arch = "x86_64")]
312     fn get_clock(&self) -> vm::Result<ClockData> {
313         self.fd
314             .get_clock()
315             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
316     }
317     /// Set guest clock.
318     #[cfg(target_arch = "x86_64")]
319     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
320         self.fd
321             .set_clock(data)
322             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
323     }
324     /// Checks if a particular `Cap` is available.
325     fn check_extension(&self, c: Cap) -> bool {
326         self.fd.check_extension(c)
327     }
328     /// Create a device that is used for passthrough
329     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
330         let mut vfio_dev = kvm_create_device {
331             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
332             fd: 0,
333             flags: 0,
334         };
335 
336         self.create_device(&mut vfio_dev)
337             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
338     }
339     ///
340     /// Get the Vm state. Return VM specific data
341     ///
342     fn state(&self) -> vm::Result<VmState> {
343         Ok(self.state)
344     }
345     ///
346     /// Set the VM state
347     ///
348     fn set_state(&self, _state: VmState) -> vm::Result<()> {
349         Ok(())
350     }
351 
352     ///
353     /// Get dirty pages bitmap (one bit per page)
354     ///
355     fn get_dirty_log(&self, slot: u32, memory_size: u64) -> vm::Result<Vec<u64>> {
356         self.fd
357             .get_dirty_log(slot, memory_size as usize)
358             .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
359     }
360 
361     ///
362     /// Initialize TDX for this VM
363     ///
364     #[cfg(feature = "tdx")]
365     fn tdx_init(&self, cpuid: &CpuId, max_vcpus: u32) -> vm::Result<()> {
366         #[repr(C)]
367         struct TdxInitVm {
368             max_vcpus: u32,
369             reserved: u32,
370             attributes: u64,
371             cpuid: u64,
372         }
373         let data = TdxInitVm {
374             max_vcpus,
375             reserved: 0,
376             attributes: 0,
377             cpuid: cpuid.as_fam_struct_ptr() as u64,
378         };
379 
380         tdx_command(
381             &self.fd.as_raw_fd(),
382             TdxCommand::InitVm,
383             0,
384             &data as *const _ as u64,
385         )
386         .map_err(vm::HypervisorVmError::InitializeTdx)
387     }
388 
389     ///
390     /// Finalize the TDX setup for this VM
391     ///
392     #[cfg(feature = "tdx")]
393     fn tdx_finalize(&self) -> vm::Result<()> {
394         tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
395             .map_err(vm::HypervisorVmError::FinalizeTdx)
396     }
397 
398     ///
399     /// Initialize memory regions for the TDX VM
400     ///
401     #[cfg(feature = "tdx")]
402     fn tdx_init_memory_region(
403         &self,
404         host_address: u64,
405         guest_address: u64,
406         size: u64,
407         measure: bool,
408     ) -> vm::Result<()> {
409         #[repr(C)]
410         struct TdxInitMemRegion {
411             host_address: u64,
412             guest_address: u64,
413             pages: u64,
414         }
415         let data = TdxInitMemRegion {
416             host_address,
417             guest_address,
418             pages: size / 4096,
419         };
420 
421         tdx_command(
422             &self.fd.as_raw_fd(),
423             TdxCommand::InitMemRegion,
424             if measure { 1 } else { 0 },
425             &data as *const _ as u64,
426         )
427         .map_err(vm::HypervisorVmError::InitMemRegionTdx)
428     }
429 }
430 
#[cfg(feature = "tdx")]
// Issue a single TDX `command` on the VM fd via the KVM_MEMORY_ENCRYPT_OP
// ioctl. `metadata` and `data` are command-specific: `data` is typically a
// guest-side pointer (as u64) to a command payload built by the caller.
// Returns the last OS error when the ioctl fails (negative return).
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    // Layout must match the kernel's expectation for this ioctl.
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: `cmd` lives for the duration of the call and is passed by
    // address; the kernel only reads it for the length of TdxIoctlCmd.
    // NOTE(review): soundness also assumes `fd` is a valid KVM VM fd and any
    // pointer embedded in `data` stays alive across the ioctl — the callers
    // above uphold this.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
462 
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to /dev/kvm used for all system-level ioctls.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug)]
pub enum KvmError {
    /// A required KVM capability is not exposed by the running kernel.
    CapabilityMissing(Cap),
}
/// Result alias used by KVM capability checks (see check_required_kvm_extensions).
pub type KvmResult<T> = result::Result<T, KvmError>;
473 impl KvmHypervisor {
474     /// Create a hypervisor based on Kvm
475     pub fn new() -> hypervisor::Result<KvmHypervisor> {
476         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
477         let api_version = kvm_obj.get_api_version();
478 
479         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
480             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
481         }
482 
483         Ok(KvmHypervisor { kvm: kvm_obj })
484     }
485 }
486 /// Implementation of Hypervisor trait for KVM
487 /// Example:
488 /// #[cfg(feature = "kvm")]
489 /// extern crate hypervisor
490 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
491 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
492 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
493 ///
494 impl hypervisor::Hypervisor for KvmHypervisor {
495     /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
496     /// Example
497     /// # extern crate hypervisor;
498     /// # use hypervisor::KvmHypervisor;
499     /// use hypervisor::KvmVm;
500     /// let hypervisor = KvmHypervisor::new().unwrap();
501     /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap()
502     ///
503     fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
504         let fd: VmFd;
505         loop {
506             match self.kvm.create_vm_with_type(vm_type) {
507                 Ok(res) => fd = res,
508                 Err(e) => {
509                     if e.errno() == libc::EINTR {
510                         // If the error returned is EINTR, which means the
511                         // ioctl has been interrupted, we have to retry as
512                         // this can't be considered as a regular error.
513                         continue;
514                     } else {
515                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
516                     }
517                 }
518             }
519             break;
520         }
521 
522         let vm_fd = Arc::new(fd);
523 
524         #[cfg(target_arch = "x86_64")]
525         {
526             let msr_list = self.get_msr_list()?;
527             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
528             let mut msrs = MsrEntries::new(num_msrs).unwrap();
529             let indices = msr_list.as_slice();
530             let msr_entries = msrs.as_mut_slice();
531             for (pos, index) in indices.iter().enumerate() {
532                 msr_entries[pos].index = *index;
533             }
534 
535             Ok(Arc::new(KvmVm {
536                 fd: vm_fd,
537                 msrs,
538                 state: VmState {},
539             }))
540         }
541 
542         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
543         {
544             Ok(Arc::new(KvmVm {
545                 fd: vm_fd,
546                 state: VmState {},
547             }))
548         }
549     }
550 
551     /// Create a KVM vm object and return the object as Vm trait object
552     /// Example
553     /// # extern crate hypervisor;
554     /// # use hypervisor::KvmHypervisor;
555     /// use hypervisor::KvmVm;
556     /// let hypervisor = KvmHypervisor::new().unwrap();
557     /// let vm = hypervisor.create_vm().unwrap()
558     ///
559     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
560         self.create_vm_with_type(0) // Create with default platform type
561     }
562 
563     fn check_required_extensions(&self) -> hypervisor::Result<()> {
564         check_required_kvm_extensions(&self.kvm).expect("Missing KVM capabilities");
565         Ok(())
566     }
567 
568     ///
569     ///  Returns the size of the memory mapping required to use the vcpu's `kvm_run` structure.
570     ///
571     fn get_vcpu_mmap_size(&self) -> hypervisor::Result<usize> {
572         self.kvm
573             .get_vcpu_mmap_size()
574             .map_err(|e| hypervisor::HypervisorError::GetVcpuMmap(e.into()))
575     }
576     ///
577     /// Gets the recommended maximum number of VCPUs per VM.
578     ///
579     fn get_max_vcpus(&self) -> hypervisor::Result<usize> {
580         Ok(self.kvm.get_max_vcpus())
581     }
582     ///
583     /// Gets the recommended number of VCPUs per VM.
584     ///
585     fn get_nr_vcpus(&self) -> hypervisor::Result<usize> {
586         Ok(self.kvm.get_nr_vcpus())
587     }
588     #[cfg(target_arch = "x86_64")]
589     ///
590     /// Checks if a particular `Cap` is available.
591     ///
592     fn check_capability(&self, c: Cap) -> bool {
593         self.kvm.check_extension(c)
594     }
595     #[cfg(target_arch = "x86_64")]
596     ///
597     /// X86 specific call to get the system supported CPUID values.
598     ///
599     fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
600         self.kvm
601             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
602             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
603     }
604     #[cfg(target_arch = "x86_64")]
605     ///
606     /// Retrieve the list of MSRs supported by KVM.
607     ///
608     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
609         self.kvm
610             .get_msr_index_list()
611             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
612     }
613 }
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Kernel vCPU file descriptor all per-vCPU ioctls are issued on.
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    // MSR entry list cloned from the owning KvmVm at vCPU creation.
    msrs: MsrEntries,
    // Optional VMM callbacks used by run() to service PIO/MMIO exits
    // directly instead of bubbling them up to the caller.
    vmmops: Option<Arc<Box<dyn vm::VmmOps>>>,
    #[cfg(target_arch = "x86_64")]
    // Set once enable_hyperv_synic() succeeds; influences which MSRs
    // are saved later.
    hyperv_synic: AtomicBool,
}
623 /// Implementation of Vcpu trait for KVM
624 /// Example:
625 /// #[cfg(feature = "kvm")]
626 /// extern crate hypervisor
627 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
628 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
629 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
630 /// let vcpu = vm.create_vcpu(0, None).unwrap();
631 /// vcpu.get/set().unwrap()
632 ///
633 impl cpu::Vcpu for KvmVcpu {
634     #[cfg(target_arch = "x86_64")]
635     ///
636     /// Returns the vCPU general purpose registers.
637     ///
638     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
639         self.fd
640             .get_regs()
641             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
642     }
643     #[cfg(target_arch = "x86_64")]
644     ///
645     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
646     ///
647     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
648         self.fd
649             .set_regs(regs)
650             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
651     }
652     #[cfg(target_arch = "x86_64")]
653     ///
654     /// Returns the vCPU special registers.
655     ///
656     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
657         self.fd
658             .get_sregs()
659             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
660     }
661     #[cfg(target_arch = "x86_64")]
662     ///
663     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
664     ///
665     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
666         self.fd
667             .set_sregs(sregs)
668             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
669     }
670     #[cfg(target_arch = "x86_64")]
671     ///
672     /// Returns the floating point state (FPU) from the vCPU.
673     ///
674     fn get_fpu(&self) -> cpu::Result<FpuState> {
675         self.fd
676             .get_fpu()
677             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
678     }
679     #[cfg(target_arch = "x86_64")]
680     ///
681     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
682     ///
683     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
684         self.fd
685             .set_fpu(fpu)
686             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
687     }
688     #[cfg(target_arch = "x86_64")]
689     ///
690     /// X86 specific call to setup the CPUID registers.
691     ///
692     fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
693         self.fd
694             .set_cpuid2(cpuid)
695             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
696     }
697     #[cfg(target_arch = "x86_64")]
698     ///
699     /// X86 specific call to enable HyperV SynIC
700     ///
701     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
702         // Update the information about Hyper-V SynIC being enabled and
703         // emulated as it will influence later which MSRs should be saved.
704         self.hyperv_synic.store(true, Ordering::Release);
705 
706         let cap = kvm_enable_cap {
707             cap: KVM_CAP_HYPERV_SYNIC,
708             ..Default::default()
709         };
710         self.fd
711             .enable_cap(&cap)
712             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
713     }
714     ///
715     /// X86 specific call to retrieve the CPUID registers.
716     ///
717     #[cfg(target_arch = "x86_64")]
718     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
719         self.fd
720             .get_cpuid2(num_entries)
721             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
722     }
723     #[cfg(target_arch = "x86_64")]
724     ///
725     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
726     ///
727     fn get_lapic(&self) -> cpu::Result<LapicState> {
728         self.fd
729             .get_lapic()
730             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
731     }
732     #[cfg(target_arch = "x86_64")]
733     ///
734     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
735     ///
736     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
737         self.fd
738             .set_lapic(klapic)
739             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
740     }
741     #[cfg(target_arch = "x86_64")]
742     ///
743     /// Returns the model-specific registers (MSR) for this vCPU.
744     ///
745     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
746         self.fd
747             .get_msrs(msrs)
748             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
749     }
750     #[cfg(target_arch = "x86_64")]
751     ///
752     /// Setup the model-specific registers (MSR) for this vCPU.
753     /// Returns the number of MSR entries actually written.
754     ///
755     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
756         self.fd
757             .set_msrs(msrs)
758             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
759     }
760     ///
761     /// Returns the vcpu's current "multiprocessing state".
762     ///
763     fn get_mp_state(&self) -> cpu::Result<MpState> {
764         self.fd
765             .get_mp_state()
766             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
767     }
768     ///
769     /// Sets the vcpu's current "multiprocessing state".
770     ///
771     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
772         self.fd
773             .set_mp_state(mp_state)
774             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
775     }
776     #[cfg(target_arch = "x86_64")]
777     ///
778     /// X86 specific call that returns the vcpu's current "xsave struct".
779     ///
780     fn get_xsave(&self) -> cpu::Result<Xsave> {
781         self.fd
782             .get_xsave()
783             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
784     }
785     #[cfg(target_arch = "x86_64")]
786     ///
787     /// X86 specific call that sets the vcpu's current "xsave struct".
788     ///
789     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
790         self.fd
791             .set_xsave(xsave)
792             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
793     }
794     #[cfg(target_arch = "x86_64")]
795     ///
796     /// X86 specific call that returns the vcpu's current "xcrs".
797     ///
798     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
799         self.fd
800             .get_xcrs()
801             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
802     }
803     #[cfg(target_arch = "x86_64")]
804     ///
805     /// X86 specific call that sets the vcpu's current "xcrs".
806     ///
807     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
808         self.fd
809             .set_xcrs(&xcrs)
810             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
811     }
812     ///
813     /// Triggers the running of the current virtual CPU returning an exit reason.
814     ///
815     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
816         match self.fd.run() {
817             Ok(run) => match run {
818                 #[cfg(target_arch = "x86_64")]
819                 VcpuExit::IoIn(addr, data) => {
820                     if let Some(vmmops) = &self.vmmops {
821                         return vmmops
822                             .pio_read(addr.into(), data)
823                             .map(|_| cpu::VmExit::Ignore)
824                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
825                     }
826 
827                     Ok(cpu::VmExit::IoIn(addr, data))
828                 }
829                 #[cfg(target_arch = "x86_64")]
830                 VcpuExit::IoOut(addr, data) => {
831                     if let Some(vmmops) = &self.vmmops {
832                         return vmmops
833                             .pio_write(addr.into(), data)
834                             .map(|_| cpu::VmExit::Ignore)
835                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
836                     }
837 
838                     Ok(cpu::VmExit::IoOut(addr, data))
839                 }
840                 #[cfg(target_arch = "x86_64")]
841                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
842                 #[cfg(target_arch = "x86_64")]
843                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
844 
845                 #[cfg(target_arch = "aarch64")]
846                 VcpuExit::SystemEvent(event_type, flags) => {
847                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
848                     // On Aarch64, when the VM is shutdown, run() returns
849                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
850                     if event_type == KVM_SYSTEM_EVENT_RESET {
851                         Ok(cpu::VmExit::Reset)
852                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
853                         Ok(cpu::VmExit::Shutdown)
854                     } else {
855                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
856                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
857                             event_type,
858                             flags
859                         )))
860                     }
861                 }
862 
863                 VcpuExit::MmioRead(addr, data) => {
864                     if let Some(vmmops) = &self.vmmops {
865                         return vmmops
866                             .mmio_read(addr, data)
867                             .map(|_| cpu::VmExit::Ignore)
868                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
869                     }
870 
871                     Ok(cpu::VmExit::MmioRead(addr, data))
872                 }
873                 VcpuExit::MmioWrite(addr, data) => {
874                     if let Some(vmmops) = &self.vmmops {
875                         return vmmops
876                             .mmio_write(addr, data)
877                             .map(|_| cpu::VmExit::Ignore)
878                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
879                     }
880 
881                     Ok(cpu::VmExit::MmioWrite(addr, data))
882                 }
883                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
884 
885                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
886                     "Unexpected exit reason on vcpu run: {:?}",
887                     r
888                 ))),
889             },
890 
891             Err(ref e) => match e.errno() {
892                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
893                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
894                     "VCPU error {:?}",
895                     e
896                 ))),
897             },
898         }
899     }
900     #[cfg(target_arch = "x86_64")]
901     ///
902     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
903     /// states of the vcpu.
904     ///
905     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
906         self.fd
907             .get_vcpu_events()
908             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
909     }
910     #[cfg(target_arch = "x86_64")]
911     ///
912     /// Sets pending exceptions, interrupts, and NMIs as well as related states
913     /// of the vcpu.
914     ///
915     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
916         self.fd
917             .set_vcpu_events(events)
918             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
919     }
920     #[cfg(target_arch = "x86_64")]
921     ///
922     /// Let the guest know that it has been paused, which prevents from
923     /// potential soft lockups when being resumed.
924     ///
925     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
926         self.fd
927             .kvmclock_ctrl()
928             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
929     }
930     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
931     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
932         self.fd
933             .vcpu_init(kvi)
934             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
935     }
936     ///
937     /// Sets the value of one register for this vCPU.
938     ///
939     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
940     fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
941         self.fd
942             .set_one_reg(reg_id, data)
943             .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
944     }
945     ///
946     /// Gets the value of one register for this vCPU.
947     ///
948     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
949     fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
950         self.fd
951             .get_one_reg(reg_id)
952             .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
953     }
954     ///
955     /// Gets a list of the guest registers that are supported for the
956     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
957     ///
958     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
959     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
960         self.fd
961             .get_reg_list(reg_list)
962             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
963     }
964     ///
965     /// Save the state of the core registers.
966     ///
967     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
968     fn core_registers(&self, state: &mut StandardRegisters) -> cpu::Result<()> {
969         let mut off = offset__of!(user_pt_regs, regs);
970         // There are 31 user_pt_regs:
971         // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
972         // These actually are the general-purpose registers of the Armv8-a
973         // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register).
974         for i in 0..31 {
975             state.regs.regs[i] = self
976                 .fd
977                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
978                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
979             off += std::mem::size_of::<u64>();
980         }
981 
982         // We are now entering the "Other register" section of the ARMv8-a architecture.
983         // First one, stack pointer.
984         let off = offset__of!(user_pt_regs, sp);
985         state.regs.sp = self
986             .fd
987             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
988             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
989 
990         // Second one, the program counter.
991         let off = offset__of!(user_pt_regs, pc);
992         state.regs.pc = self
993             .fd
994             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
995             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
996 
997         // Next is the processor state.
998         let off = offset__of!(user_pt_regs, pstate);
999         state.regs.pstate = self
1000             .fd
1001             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1002             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1003 
1004         // The stack pointer associated with EL1
1005         let off = offset__of!(kvm_regs, sp_el1);
1006         state.sp_el1 = self
1007             .fd
1008             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1009             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1010 
1011         // Exception Link Register for EL1, when taking an exception to EL1, this register
1012         // holds the address to which to return afterwards.
1013         let off = offset__of!(kvm_regs, elr_el1);
1014         state.elr_el1 = self
1015             .fd
1016             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1017             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1018 
1019         // Saved Program Status Registers, there are 5 of them used in the kernel.
1020         let mut off = offset__of!(kvm_regs, spsr);
1021         for i in 0..KVM_NR_SPSR as usize {
1022             state.spsr[i] = self
1023                 .fd
1024                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1025                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1026             off += std::mem::size_of::<u64>();
1027         }
1028 
1029         // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel:
1030         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1031         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1032         for i in 0..32 {
1033             state.fp_regs.vregs[i][0] = self
1034                 .fd
1035                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
1036                 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
1037             off += mem::size_of::<u128>();
1038         }
1039 
1040         // Floating-point Status Register
1041         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1042         state.fp_regs.fpsr = self
1043             .fd
1044             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1045             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1046             as u32;
1047 
1048         // Floating-point Control Register
1049         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1050         state.fp_regs.fpcr = self
1051             .fd
1052             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
1053             .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
1054             as u32;
1055         Ok(())
1056     }
1057     ///
1058     /// Restore the state of the core registers.
1059     ///
1060     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1061     fn set_core_registers(&self, state: &StandardRegisters) -> cpu::Result<()> {
1062         // The function follows the exact identical order from `state`. Look there
1063         // for some additional info on registers.
1064         let mut off = offset__of!(user_pt_regs, regs);
1065         for i in 0..31 {
1066             self.fd
1067                 .set_one_reg(
1068                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1069                     state.regs.regs[i],
1070                 )
1071                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1072             off += std::mem::size_of::<u64>();
1073         }
1074 
1075         let off = offset__of!(user_pt_regs, sp);
1076         self.fd
1077             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
1078             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1079 
1080         let off = offset__of!(user_pt_regs, pc);
1081         self.fd
1082             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
1083             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1084 
1085         let off = offset__of!(user_pt_regs, pstate);
1086         self.fd
1087             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
1088             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1089 
1090         let off = offset__of!(kvm_regs, sp_el1);
1091         self.fd
1092             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
1093             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1094 
1095         let off = offset__of!(kvm_regs, elr_el1);
1096         self.fd
1097             .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
1098             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1099 
1100         let mut off = offset__of!(kvm_regs, spsr);
1101         for i in 0..KVM_NR_SPSR as usize {
1102             self.fd
1103                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
1104                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1105             off += std::mem::size_of::<u64>();
1106         }
1107 
1108         let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
1109         for i in 0..32 {
1110             self.fd
1111                 .set_one_reg(
1112                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1113                     state.fp_regs.vregs[i][0],
1114                 )
1115                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1116             off += mem::size_of::<u128>();
1117         }
1118 
1119         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
1120         self.fd
1121             .set_one_reg(
1122                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1123                 state.fp_regs.fpsr as u64,
1124             )
1125             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1126 
1127         let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
1128         self.fd
1129             .set_one_reg(
1130                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1131                 state.fp_regs.fpcr as u64,
1132             )
1133             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1134         Ok(())
1135     }
1136     ///
1137     /// Save the state of the system registers.
1138     ///
1139     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1140     fn system_registers(&self, state: &mut Vec<Register>) -> cpu::Result<()> {
1141         // Call KVM_GET_REG_LIST to get all registers available to the guest. For ArmV8 there are
1142         // around 500 registers.
1143         let mut reg_list = RegList::new(500).unwrap();
1144         self.fd
1145             .get_reg_list(&mut reg_list)
1146             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1147 
1148         // At this point reg_list should contain: core registers and system registers.
1149         // The register list contains the number of registers and their ids. We will be needing to
1150         // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the list
1151         // the core registers which are represented in the kernel by kvm_regs structure and for which
1152         // we can calculate the id based on the offset in the structure.
1153 
1154         reg_list.retain(|regid| *regid != 0);
1155         reg_list.as_slice().to_vec().sort_unstable();
1156 
1157         reg_list.retain(|regid| is_system_register(*regid));
1158 
1159         // Now, for the rest of the registers left in the previously fetched register list, we are
1160         // simply calling KVM_GET_ONE_REG.
1161         let indices = reg_list.as_slice();
1162         for (_pos, index) in indices.iter().enumerate() {
1163             if _pos > 230 {
1164                 break;
1165             }
1166             state.push(kvm_bindings::kvm_one_reg {
1167                 id: *index,
1168                 addr: self
1169                     .fd
1170                     .get_one_reg(*index)
1171                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
1172             });
1173         }
1174 
1175         Ok(())
1176     }
1177     ///
1178     /// Restore the state of the system registers.
1179     ///
1180     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1181     fn set_system_registers(&self, state: &[Register]) -> cpu::Result<()> {
1182         for reg in state {
1183             self.fd
1184                 .set_one_reg(reg.id, reg.addr)
1185                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
1186         }
1187         Ok(())
1188     }
1189     ///
1190     /// Read the MPIDR - Multiprocessor Affinity Register.
1191     ///
1192     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
1193     fn read_mpidr(&self) -> cpu::Result<u64> {
1194         self.fd
1195             .get_one_reg(MPIDR_EL1)
1196             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
1197     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        // Gather each piece of vCPU state in the order mandated by the
        // ordering requirements documented above.
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?;
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 0x400000b5,
                0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr).unwrap();
            }
        }

        let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // GET_MSRS stopped early: `num_msrs` entries were filled before an
            // unreadable MSR was hit, so that faulty MSR sits at index
            // `num_msrs`. Retry in chunks, skipping one faulty MSR at a time.
            let mut faulty_msr_index = num_msrs;
            // Seed the result with the entries successfully read so far.
            let mut msr_entries_tmp =
                MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]).unwrap();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries.as_slice()[faulty_msr_index].index
                );

                // Retry the remainder of the list, starting just past the
                // faulty entry.
                let start_pos = faulty_msr_index + 1;
                let mut sub_msr_entries =
                    MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                // Accumulate whatever was read in this chunk.
                for i in 0..num_msrs {
                    msr_entries_tmp
                        .push(sub_msr_entries.as_slice()[i])
                        .map_err(|e| {
                            cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
                                "Failed adding MSR entries: {:?}",
                                e
                            ))
                        })?;
                }

                // The whole chunk was read: no more faulty MSRs remain.
                if num_msrs == expected_num_msrs {
                    break;
                }

                // Another faulty MSR was hit; its absolute position in the
                // original list is the chunk start plus the count read.
                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        // Per the ordering notes above, vCPU events are captured last.
        let vcpu_events = self.get_vcpu_events()?;

        Ok(CpuState {
            cpuid,
            msrs,
            vcpu_events,
            regs,
            sregs,
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        })
    }
1329     ///
1330     /// Get the current AArch64 CPU state
1331     ///
1332     #[cfg(target_arch = "aarch64")]
1333     fn state(&self) -> cpu::Result<CpuState> {
1334         let mut state = CpuState {
1335             mp_state: self.get_mp_state()?,
1336             mpidr: self.read_mpidr()?,
1337             ..Default::default()
1338         };
1339         self.core_registers(&mut state.core_regs)?;
1340         self.system_registers(&mut state.sys_regs)?;
1341 
1342         Ok(state)
1343     }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores it.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully, when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        // Restore each piece of state in the order mandated by the ordering
        // requirements documented above.
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state)?;
        self.set_regs(&state.regs)?;
        self.set_sregs(&state.sregs)?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fallback onto a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            // SET_MSRS stopped early: `num_msrs` entries were written before
            // an unwritable MSR was hit, so that faulty MSR sits at index
            // `num_msrs`. Retry the remainder, skipping one faulty MSR at a
            // time.
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs.as_slice()[faulty_msr_index].index
                );

                // Retry the rest of the list, starting just past the faulty
                // entry.
                let start_pos = faulty_msr_index + 1;
                let sub_msr_entries =
                    MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]).unwrap();
                let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                // The whole chunk was written: no more faulty MSRs remain.
                if num_msrs == expected_num_msrs {
                    break;
                }

                // Another faulty MSR was hit; its absolute position is the
                // chunk start plus the count written.
                faulty_msr_index = start_pos + num_msrs;
            }
        }

        // Per the ordering notes above, vCPU events are restored last.
        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
1428     ///
1429     /// Restore the previously saved AArch64 CPU state
1430     ///
1431     #[cfg(target_arch = "aarch64")]
1432     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1433         self.set_core_registers(&state.core_regs)?;
1434         self.set_system_registers(&state.sys_regs)?;
1435         self.set_mp_state(state.mp_state)?;
1436 
1437         Ok(())
1438     }
1439 
1440     ///
1441     /// Initialize TDX for this CPU
1442     ///
1443     #[cfg(feature = "tdx")]
1444     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
1445         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
1446             .map_err(cpu::HypervisorCpuError::InitializeTdx)
1447     }
1448 }
1449 
/// Device struct for KVM
pub struct KvmDevice {
    // Underlying KVM device file descriptor; used by the attribute
    // get/set ioctls in the `device::Device` impl below.
    fd: DeviceFd,
}
1454 
1455 impl device::Device for KvmDevice {
1456     ///
1457     /// Set device attribute
1458     ///
1459     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
1460         self.fd
1461             .set_device_attr(attr)
1462             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
1463     }
1464     ///
1465     /// Get device attribute
1466     ///
1467     fn get_device_attr(&self, attr: &mut DeviceAttr) -> device::Result<()> {
1468         self.fd
1469             .get_device_attr(attr)
1470             .map_err(|e| device::HypervisorDeviceError::GetDeviceAttribute(e.into()))
1471     }
1472 }
1473 
impl AsRawFd for KvmDevice {
    // Expose the raw file descriptor of the underlying KVM device.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
1479