xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision da642fcf7fa90e06cfd5256a77ae4be6cfde811f)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
4 //
5 // Copyright © 2020, Microsoft Corporation
6 //
7 // Copyright 2018-2019 CrowdStrike, Inc.
8 //
9 //
10 
11 #[cfg(target_arch = "aarch64")]
12 pub use crate::aarch64::{check_required_kvm_extensions, VcpuInit, VcpuKvmState as CpuState};
13 use crate::cpu;
14 use crate::device;
15 use crate::hypervisor;
16 use crate::vm;
17 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
18 use serde_derive::{Deserialize, Serialize};
19 use std::os::unix::io::{AsRawFd, RawFd};
20 use std::result;
21 use std::sync::Arc;
22 #[cfg(target_arch = "x86_64")]
23 use vm_memory::Address;
24 use vmm_sys_util::eventfd::EventFd;
25 // x86_64 dependencies
26 #[cfg(target_arch = "x86_64")]
27 pub mod x86_64;
28 
29 #[cfg(target_arch = "x86_64")]
30 use x86_64::{
31     check_required_kvm_extensions, FpuState, SpecialRegisters, StandardRegisters, KVM_TSS_ADDRESS,
32 };
33 
34 #[cfg(target_arch = "x86_64")]
35 pub use x86_64::{
36     CpuId, CpuIdEntry, ExtendedControlRegisters, LapicState, MsrEntries, VcpuKvmState as CpuState,
37     Xsave, CPUID_FLAG_VALID_INDEX,
38 };
39 
40 #[cfg(target_arch = "x86_64")]
41 use kvm_bindings::{kvm_enable_cap, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP};
42 
43 #[cfg(target_arch = "x86_64")]
44 use crate::arch::x86::NUM_IOAPIC_PINS;
45 
46 // aarch64 dependencies
47 #[cfg(target_arch = "aarch64")]
48 pub mod aarch64;
49 
50 pub use kvm_bindings;
51 pub use kvm_bindings::{
52     kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing, kvm_irq_routing_entry,
53     kvm_userspace_memory_region, KVM_IRQ_ROUTING_MSI, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
54 };
55 pub use kvm_ioctls;
56 pub use kvm_ioctls::{Cap, Kvm};
57 
58 ///
59 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms
60 ///
61 pub use {
62     kvm_bindings::kvm_clock_data as ClockData, kvm_bindings::kvm_create_device as CreateDevice,
63     kvm_bindings::kvm_device_attr as DeviceAttr,
64     kvm_bindings::kvm_irq_routing_entry as IrqRoutingEntry, kvm_bindings::kvm_mp_state as MpState,
65     kvm_bindings::kvm_userspace_memory_region as MemoryRegion,
66     kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::DeviceFd, kvm_ioctls::IoEventAddress,
67     kvm_ioctls::VcpuExit,
68 };
/// KVM-specific VM state returned by `Vm::state()` and accepted by
/// `Vm::set_state()`. Currently empty: this KVM wrapper keeps no extra
/// VM-level state that needs to be saved or restored here.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize, Serialize)]
pub struct KvmVmState {}
71 
72 pub use KvmVmState as VmState;
/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    // Shared handle to the kernel VM file descriptor; all VM-scoped ioctls
    // go through it. `Arc` because vCPUs and devices may outlive borrows.
    fd: Arc<VmFd>,
    // Template of host-supported MSR indices; cloned into every vCPU
    // created by `create_vcpu()`.
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
    // State handed back by `state()`; currently carries no data.
    state: KvmVmState,
}
80 
use std::mem::size_of;

// Returns a `Vec<T>` whose allocation is at least `size_in_bytes` bytes,
// with every element default-initialized.
fn vec_with_size_in_bytes<T: Default>(size_in_bytes: usize) -> Vec<T> {
    // Number of `T`-sized elements needed to cover the request (ceiling division).
    let num_elements = (size_in_bytes + size_of::<T>() - 1) / size_of::<T>();
    std::iter::repeat_with(T::default).take(num_elements).collect()
}

// The kvm API has many structs that resemble the following `Foo` structure:
//
// ```
// #[repr(C)]
// struct Foo {
//    some_data: u32
//    entries: __IncompleteArrayField<__u32>,
// }
// ```
//
// In order to allocate such a structure, `size_of::<Foo>()` would be too small because it would
// not include any space for `entries`. To make the allocation large enough while still being
// aligned for `Foo`, a `Vec<Foo>` is created. Only the first element of `Vec<Foo>` would actually
// be used as a `Foo`. The remaining memory in the `Vec<Foo>` is for `entries`, which must be
// contiguous with `Foo`. This function is used to make the `Vec<Foo>` with enough space for
// `count` entries.
fn vec_with_array_field<T: Default, F>(count: usize) -> Vec<T> {
    vec_with_size_in_bytes(size_of::<T>() + count * size_of::<F>())
}
110 
111 ///
112 /// Implementation of Vm trait for KVM
113 /// Example:
114 /// #[cfg(feature = "kvm")]
115 /// extern crate hypervisor
116 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
117 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
118 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
119 /// vm.set/get().unwrap()
120 ///
121 impl vm::Vm for KvmVm {
122     #[cfg(target_arch = "x86_64")]
123     ///
124     /// Sets the address of the three-page region in the VM's address space.
125     ///
126     fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
127         self.fd
128             .set_tss_address(offset)
129             .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
130     }
131     ///
132     /// Creates an in-kernel interrupt controller.
133     ///
134     fn create_irq_chip(&self) -> vm::Result<()> {
135         self.fd
136             .create_irq_chip()
137             .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
138     }
139     ///
140     /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
141     ///
142     fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
143         self.fd
144             .register_irqfd(fd, gsi)
145             .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
146     }
147     ///
148     /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
149     ///
150     fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
151         self.fd
152             .unregister_irqfd(fd, gsi)
153             .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
154     }
155     ///
156     /// Creates a VcpuFd object from a vcpu RawFd.
157     ///
158     fn create_vcpu(&self, id: u8) -> vm::Result<Arc<dyn cpu::Vcpu>> {
159         let vc = self
160             .fd
161             .create_vcpu(id)
162             .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
163         let vcpu = KvmVcpu {
164             fd: vc,
165             #[cfg(target_arch = "x86_64")]
166             msrs: self.msrs.clone(),
167         };
168         Ok(Arc::new(vcpu))
169     }
170     ///
171     /// Registers an event to be signaled whenever a certain address is written to.
172     ///
173     fn register_ioevent(
174         &self,
175         fd: &EventFd,
176         addr: &IoEventAddress,
177         datamatch: Option<vm::DataMatch>,
178     ) -> vm::Result<()> {
179         if let Some(dm) = datamatch {
180             match dm {
181                 vm::DataMatch::DataMatch32(kvm_dm32) => self
182                     .fd
183                     .register_ioevent(fd, addr, kvm_dm32)
184                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
185                 vm::DataMatch::DataMatch64(kvm_dm64) => self
186                     .fd
187                     .register_ioevent(fd, addr, kvm_dm64)
188                     .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
189             }
190         } else {
191             self.fd
192                 .register_ioevent(fd, addr, NoDatamatch)
193                 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
194         }
195     }
196     ///
197     /// Unregisters an event from a certain address it has been previously registered to.
198     ///
199     fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
200         self.fd
201             .unregister_ioevent(fd, addr, NoDatamatch)
202             .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
203     }
204     ///
205     /// Sets the GSI routing table entries, overwriting any previously set
206     /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
207     ///
208     fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
209         let mut irq_routing =
210             vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
211         irq_routing[0].nr = entries.len() as u32;
212         irq_routing[0].flags = 0;
213 
214         unsafe {
215             let entries_slice: &mut [kvm_irq_routing_entry] =
216                 irq_routing[0].entries.as_mut_slice(entries.len());
217             entries_slice.copy_from_slice(&entries);
218         }
219 
220         self.fd
221             .set_gsi_routing(&irq_routing[0])
222             .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
223     }
224     ///
225     /// Creates a memory region structure that can be used with set_user_memory_region
226     ///
227     fn make_user_memory_region(
228         &self,
229         slot: u32,
230         guest_phys_addr: u64,
231         memory_size: u64,
232         userspace_addr: u64,
233         readonly: bool,
234     ) -> MemoryRegion {
235         MemoryRegion {
236             slot,
237             guest_phys_addr,
238             memory_size,
239             userspace_addr,
240             flags: if readonly { KVM_MEM_READONLY } else { 0 },
241         }
242     }
243     ///
244     /// Creates/modifies a guest physical memory slot.
245     ///
246     fn set_user_memory_region(&self, user_memory_region: MemoryRegion) -> vm::Result<()> {
247         // Safe because guest regions are guaranteed not to overlap.
248         unsafe {
249             self.fd
250                 .set_user_memory_region(user_memory_region)
251                 .map_err(|e| vm::HypervisorVmError::SetUserMemory(e.into()))
252         }
253     }
254     ///
255     /// Creates an emulated device in the kernel.
256     ///
257     /// See the documentation for `KVM_CREATE_DEVICE`.
258     fn create_device(&self, device: &mut CreateDevice) -> vm::Result<Arc<dyn device::Device>> {
259         let fd = self
260             .fd
261             .create_device(device)
262             .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
263         let device = KvmDevice { fd };
264         Ok(Arc::new(device))
265     }
266     ///
267     /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
268     ///
269     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
270     fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
271         self.fd
272             .get_preferred_target(kvi)
273             .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
274     }
275     #[cfg(target_arch = "x86_64")]
276     fn enable_split_irq(&self) -> vm::Result<()> {
277         // Set TSS
278         self.fd
279             .set_tss_address(KVM_TSS_ADDRESS.raw_value() as usize)
280             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
281         // Create split irqchip
282         // Only the local APIC is emulated in kernel, both PICs and IOAPIC
283         // are not.
284         let mut cap: kvm_enable_cap = Default::default();
285         cap.cap = KVM_CAP_SPLIT_IRQCHIP;
286         cap.args[0] = NUM_IOAPIC_PINS as u64;
287         self.fd
288             .enable_cap(&cap)
289             .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
290         Ok(())
291     }
292     /// Retrieve guest clock.
293     #[cfg(target_arch = "x86_64")]
294     fn get_clock(&self) -> vm::Result<ClockData> {
295         self.fd
296             .get_clock()
297             .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))
298     }
299     /// Set guest clock.
300     #[cfg(target_arch = "x86_64")]
301     fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
302         self.fd
303             .set_clock(data)
304             .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
305     }
306     /// Checks if a particular `Cap` is available.
307     fn check_extension(&self, c: Cap) -> bool {
308         self.fd.check_extension(c)
309     }
310     /// Create a device that is used for passthrough
311     fn create_passthrough_device(&self) -> vm::Result<Arc<dyn device::Device>> {
312         let mut vfio_dev = kvm_create_device {
313             type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
314             fd: 0,
315             flags: 0,
316         };
317 
318         self.create_device(&mut vfio_dev)
319             .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
320     }
321     ///
322     /// Get the Vm state. Return VM specific data
323     ///
324     fn state(&self) -> vm::Result<VmState> {
325         Ok(self.state)
326     }
327     ///
328     /// Set the VM state
329     ///
330     fn set_state(&self, _state: &VmState) -> vm::Result<()> {
331         Ok(())
332     }
333 }
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    // Handle to /dev/kvm, used for all system-level (non-VM) KVM ioctls.
    kvm: Kvm,
}
/// Enum for KVM related error
#[derive(Debug)]
pub enum KvmError {
    /// A KVM capability required by the VMM is missing on the host kernel.
    CapabilityMissing(Cap),
}
/// Result type for KVM capability checks.
pub type KvmResult<T> = result::Result<T, KvmError>;
344 impl KvmHypervisor {
345     /// Create a hypervisor based on Kvm
346     pub fn new() -> hypervisor::Result<KvmHypervisor> {
347         let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
348         let api_version = kvm_obj.get_api_version();
349 
350         if api_version != kvm_bindings::KVM_API_VERSION as i32 {
351             return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
352         }
353 
354         Ok(KvmHypervisor { kvm: kvm_obj })
355     }
356 }
357 /// Implementation of Hypervisor trait for KVM
358 /// Example:
359 /// #[cfg(feature = "kvm")]
360 /// extern crate hypervisor
361 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
362 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
363 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
364 ///
365 impl hypervisor::Hypervisor for KvmHypervisor {
366     /// Create a KVM vm object and return the object as Vm trait object
367     /// Example
368     /// # extern crate hypervisor;
369     /// # use hypervisor::KvmHypervisor;
370     /// use hypervisor::KvmVm;
371     /// let hypervisor = KvmHypervisor::new().unwrap();
372     /// let vm = hypervisor.create_vm().unwrap()
373     ///
374     fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
375         let fd: VmFd;
376         loop {
377             match self.kvm.create_vm() {
378                 Ok(res) => fd = res,
379                 Err(e) => {
380                     if e.errno() == libc::EINTR {
381                         // If the error returned is EINTR, which means the
382                         // ioctl has been interrupted, we have to retry as
383                         // this can't be considered as a regular error.
384                         continue;
385                     } else {
386                         return Err(hypervisor::HypervisorError::VmCreate(e.into()));
387                     }
388                 }
389             }
390             break;
391         }
392 
393         let vm_fd = Arc::new(fd);
394 
395         #[cfg(target_arch = "x86_64")]
396         {
397             let msr_list = self.get_msr_list()?;
398             let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
399             let mut msrs = MsrEntries::new(num_msrs);
400             let indices = msr_list.as_slice();
401             let msr_entries = msrs.as_mut_slice();
402             for (pos, index) in indices.iter().enumerate() {
403                 msr_entries[pos].index = *index;
404             }
405 
406             Ok(Arc::new(KvmVm {
407                 fd: vm_fd,
408                 msrs,
409                 state: VmState {},
410             }))
411         }
412 
413         #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
414         {
415             Ok(Arc::new(KvmVm {
416                 fd: vm_fd,
417                 state: VmState {},
418             }))
419         }
420     }
421 
422     fn check_required_extensions(&self) -> hypervisor::Result<()> {
423         check_required_kvm_extensions(&self.kvm).expect("Missing KVM capabilities");
424         Ok(())
425     }
426 
427     ///
428     ///  Returns the size of the memory mapping required to use the vcpu's `kvm_run` structure.
429     ///
430     fn get_vcpu_mmap_size(&self) -> hypervisor::Result<usize> {
431         self.kvm
432             .get_vcpu_mmap_size()
433             .map_err(|e| hypervisor::HypervisorError::GetVcpuMmap(e.into()))
434     }
435     ///
436     /// Gets the recommended maximum number of VCPUs per VM.
437     ///
438     fn get_max_vcpus(&self) -> hypervisor::Result<usize> {
439         Ok(self.kvm.get_max_vcpus())
440     }
441     ///
442     /// Gets the recommended number of VCPUs per VM.
443     ///
444     fn get_nr_vcpus(&self) -> hypervisor::Result<usize> {
445         Ok(self.kvm.get_nr_vcpus())
446     }
447     #[cfg(target_arch = "x86_64")]
448     ///
449     /// Checks if a particular `Cap` is available.
450     ///
451     fn check_capability(&self, c: Cap) -> bool {
452         self.kvm.check_extension(c)
453     }
454     #[cfg(target_arch = "x86_64")]
455     ///
456     /// X86 specific call to get the system supported CPUID values.
457     ///
458     fn get_cpuid(&self) -> hypervisor::Result<CpuId> {
459         self.kvm
460             .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
461             .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))
462     }
463     #[cfg(target_arch = "x86_64")]
464     ///
465     /// Retrieve the list of MSRs supported by KVM.
466     ///
467     fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
468         self.kvm
469             .get_msr_index_list()
470             .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
471     }
472 }
/// Vcpu struct for KVM
pub struct KvmVcpu {
    // Kernel handle for this vCPU; every vCPU ioctl goes through it.
    fd: VcpuFd,
    // MSR index template cloned from the owning VM at creation time;
    // used when saving the vCPU state to read all supported MSR values.
    #[cfg(target_arch = "x86_64")]
    msrs: MsrEntries,
}
/// Implementation of the Vcpu trait for KVM.
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0).unwrap();
/// Then use the getters/setters, e.g. vcpu.get_regs().unwrap()
///
489 impl cpu::Vcpu for KvmVcpu {
490     #[cfg(target_arch = "x86_64")]
491     ///
492     /// Returns the vCPU general purpose registers.
493     ///
494     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
495         self.fd
496             .get_regs()
497             .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))
498     }
499     #[cfg(target_arch = "x86_64")]
500     ///
501     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
502     ///
503     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
504         self.fd
505             .set_regs(regs)
506             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
507     }
508     #[cfg(target_arch = "x86_64")]
509     ///
510     /// Returns the vCPU special registers.
511     ///
512     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
513         self.fd
514             .get_sregs()
515             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))
516     }
517     #[cfg(target_arch = "x86_64")]
518     ///
519     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
520     ///
521     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
522         self.fd
523             .set_sregs(sregs)
524             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
525     }
526     #[cfg(target_arch = "x86_64")]
527     ///
528     /// Returns the floating point state (FPU) from the vCPU.
529     ///
530     fn get_fpu(&self) -> cpu::Result<FpuState> {
531         self.fd
532             .get_fpu()
533             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))
534     }
535     #[cfg(target_arch = "x86_64")]
536     ///
537     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct.
538     ///
539     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
540         self.fd
541             .set_fpu(fpu)
542             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
543     }
544     #[cfg(target_arch = "x86_64")]
545     ///
546     /// X86 specific call to setup the CPUID registers.
547     ///
548     fn set_cpuid2(&self, cpuid: &CpuId) -> cpu::Result<()> {
549         self.fd
550             .set_cpuid2(cpuid)
551             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
552     }
553     #[cfg(target_arch = "x86_64")]
554     ///
555     /// X86 specific call to enable HyperV SynIC
556     ///
557     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
558         let mut cap: kvm_enable_cap = Default::default();
559         cap.cap = KVM_CAP_HYPERV_SYNIC;
560         self.fd
561             .enable_cap(&cap)
562             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSynIC(e.into()))
563     }
564     ///
565     /// X86 specific call to retrieve the CPUID registers.
566     ///
567     #[cfg(target_arch = "x86_64")]
568     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<CpuId> {
569         self.fd
570             .get_cpuid2(num_entries)
571             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))
572     }
573     #[cfg(target_arch = "x86_64")]
574     ///
575     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
576     ///
577     fn get_lapic(&self) -> cpu::Result<LapicState> {
578         self.fd
579             .get_lapic()
580             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))
581     }
582     #[cfg(target_arch = "x86_64")]
583     ///
584     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
585     ///
586     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
587         self.fd
588             .set_lapic(klapic)
589             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
590     }
591     #[cfg(target_arch = "x86_64")]
592     ///
593     /// Returns the model-specific registers (MSR) for this vCPU.
594     ///
595     fn get_msrs(&self, msrs: &mut MsrEntries) -> cpu::Result<usize> {
596         self.fd
597             .get_msrs(msrs)
598             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))
599     }
600     #[cfg(target_arch = "x86_64")]
601     ///
602     /// Setup the model-specific registers (MSR) for this vCPU.
603     /// Returns the number of MSR entries actually written.
604     ///
605     fn set_msrs(&self, msrs: &MsrEntries) -> cpu::Result<usize> {
606         self.fd
607             .set_msrs(msrs)
608             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
609     }
610     ///
611     /// Returns the vcpu's current "multiprocessing state".
612     ///
613     fn get_mp_state(&self) -> cpu::Result<MpState> {
614         self.fd
615             .get_mp_state()
616             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))
617     }
618     ///
619     /// Sets the vcpu's current "multiprocessing state".
620     ///
621     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
622         self.fd
623             .set_mp_state(mp_state)
624             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
625     }
626     #[cfg(target_arch = "x86_64")]
627     ///
628     /// X86 specific call that returns the vcpu's current "xsave struct".
629     ///
630     fn get_xsave(&self) -> cpu::Result<Xsave> {
631         self.fd
632             .get_xsave()
633             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
634     }
635     #[cfg(target_arch = "x86_64")]
636     ///
637     /// X86 specific call that sets the vcpu's current "xsave struct".
638     ///
639     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
640         self.fd
641             .set_xsave(xsave)
642             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
643     }
644     #[cfg(target_arch = "x86_64")]
645     ///
646     /// X86 specific call that returns the vcpu's current "xcrs".
647     ///
648     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
649         self.fd
650             .get_xcrs()
651             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
652     }
653     #[cfg(target_arch = "x86_64")]
654     ///
655     /// X86 specific call that sets the vcpu's current "xcrs".
656     ///
657     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
658         self.fd
659             .set_xcrs(&xcrs)
660             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
661     }
    ///
    /// Triggers the running of the current virtual CPU returning an exit reason.
    ///
    /// Translates the kvm-ioctls `VcpuExit` variants into the
    /// hypervisor-agnostic `cpu::VmExit` values the VMM dispatches on.
    /// Any exit reason without a mapping below is reported as an error.
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => Ok(cpu::VmExit::IoIn(addr, data)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => Ok(cpu::VmExit::IoOut(addr, data)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::KVM_SYSTEM_EVENT_SHUTDOWN;
                    // On Aarch64, when the VM is shutdown, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Reset)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => Ok(cpu::VmExit::MmioRead(addr, data)),
                VcpuExit::MmioWrite(addr, data) => Ok(cpu::VmExit::MmioWrite(addr, data)),
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            // EAGAIN/EINTR mean the ioctl was interrupted, not that the vCPU
            // failed — surface them as an ignorable exit so the caller retries.
            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
712     #[cfg(target_arch = "x86_64")]
713     ///
714     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
715     /// states of the vcpu.
716     ///
717     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
718         self.fd
719             .get_vcpu_events()
720             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
721     }
722     #[cfg(target_arch = "x86_64")]
723     ///
724     /// Sets pending exceptions, interrupts, and NMIs as well as related states
725     /// of the vcpu.
726     ///
727     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
728         self.fd
729             .set_vcpu_events(events)
730             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
731     }
732     #[cfg(target_arch = "x86_64")]
733     ///
734     /// Let the guest know that it has been paused, which prevents from
735     /// potential soft lockups when being resumed.
736     ///
737     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
738         self.fd
739             .kvmclock_ctrl()
740             .map_err(|e| cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()))
741     }
742     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
743     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
744         self.fd
745             .vcpu_init(kvi)
746             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
747     }
748     ///
749     /// Sets the value of one register for this vCPU.
750     ///
751     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
752     fn set_one_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
753         self.fd
754             .set_one_reg(reg_id, data)
755             .map_err(|e| cpu::HypervisorCpuError::SetOneReg(e.into()))
756     }
757     ///
758     /// Gets the value of one register for this vCPU.
759     ///
760     #[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
761     fn get_one_reg(&self, reg_id: u64) -> cpu::Result<u64> {
762         self.fd
763             .get_one_reg(reg_id)
764             .map_err(|e| cpu::HypervisorCpuError::GetOneReg(e.into()))
765     }
766     #[cfg(target_arch = "x86_64")]
767     ///
768     /// Get the current CPU state
769     ///
770     /// Ordering requirements:
771     ///
772     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
773     /// vCPU/LAPIC state. As such, it must be done before most everything
774     /// else, otherwise we cannot restore everything and expect it to work.
775     ///
776     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
777     /// still running.
778     ///
779     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
780     ///
781     /// GET_VCPU_EVENTS should probably be last to save. The code looks as
782     /// it might as well be affected by internal state modifications of the
783     /// GET ioctls.
784     ///
785     /// SREGS saves/restores a pending interrupt, similar to what
786     /// VCPU_EVENTS also does.
787     ///
788     /// GET_MSRS requires a pre-populated data structure to do something
789     /// meaningful. For SET_MSRS it will then contain good data.
790     ///
791     /// # Example
792     ///
793     /// ```rust
794     /// # extern crate hypervisor;
795     /// # use hypervisor::KvmHypervisor;
796     /// # use std::sync::Arc;
797     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
798     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
799     /// let vm = hv.create_vm().expect("new VM fd creation failed");
800     /// vm.enable_split_irq().unwrap();
801     /// let vcpu = vm.create_vcpu(0).unwrap();
802     /// let state = vcpu.state().unwrap();
803     /// ```
804     fn state(&self) -> cpu::Result<CpuState> {
805         let mp_state = self.get_mp_state()?;
806         let regs = self.get_regs()?;
807         let sregs = self.get_sregs()?;
808         let xsave = self.get_xsave()?;
809         let xcrs = self.get_xcrs()?;
810         let lapic_state = self.get_lapic()?;
811         let fpu = self.get_fpu()?;
812 
813         // Try to get all MSRs based on the list previously retrieved from KVM.
814         // If the number of MSRs obtained from GET_MSRS is different from the
815         // expected amount, we fallback onto a slower method by getting MSRs
816         // by chunks. This is the only way to make sure we try to get as many
817         // MSRs as possible, even if some MSRs are not supported.
818         let mut msr_entries = self.msrs.clone();
819         let expected_num_msrs = msr_entries.as_fam_struct_ref().nmsrs as usize;
820         let num_msrs = self.get_msrs(&mut msr_entries)?;
821         let msrs = if num_msrs != expected_num_msrs {
822             let mut faulty_msr_index = num_msrs;
823             let mut msr_entries_tmp =
824                 MsrEntries::from_entries(&msr_entries.as_slice()[..faulty_msr_index]);
825 
826             loop {
827                 warn!(
828                     "Detected faulty MSR 0x{:x} while getting MSRs",
829                     msr_entries.as_slice()[faulty_msr_index].index
830                 );
831 
832                 let start_pos = faulty_msr_index + 1;
833                 let mut sub_msr_entries =
834                     MsrEntries::from_entries(&msr_entries.as_slice()[start_pos..]);
835                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
836                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
837 
838                 for i in 0..num_msrs {
839                     msr_entries_tmp
840                         .push(sub_msr_entries.as_slice()[i])
841                         .map_err(|e| {
842                             cpu::HypervisorCpuError::GetMsrEntries(anyhow!(
843                                 "Failed adding MSR entries: {:?}",
844                                 e
845                             ))
846                         })?;
847                 }
848 
849                 if num_msrs == expected_num_msrs {
850                     break;
851                 }
852 
853                 faulty_msr_index = start_pos + num_msrs;
854             }
855 
856             msr_entries_tmp
857         } else {
858             msr_entries
859         };
860 
861         let vcpu_events = self.get_vcpu_events()?;
862 
863         Ok(CpuState {
864             msrs,
865             vcpu_events,
866             regs,
867             sregs,
868             fpu,
869             lapic_state,
870             xsave,
871             xcrs,
872             mp_state,
873         })
874     }
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        // Saving the vCPU state is not supported on aarch64 yet; reaching
        // this (e.g. via a snapshot path) panics.
        unimplemented!();
    }
879     #[cfg(target_arch = "x86_64")]
880     ///
881     /// Restore the previously saved CPU state
882     ///
883     /// Ordering requirements:
884     ///
885     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
886     /// still running.
887     ///
888     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
889     /// if we ever change the BSP, we have to do that before restoring anything.
890     /// The same seems to be true for CPUID stuff.
891     ///
892     /// SREGS saves/restores a pending interrupt, similar to what
893     /// VCPU_EVENTS also does.
894     ///
895     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
896     /// done before SET_VCPU_EVENTS, which restores it.
897     ///
898     /// SET_LAPIC must come after SET_SREGS, because the latter restores
899     /// the apic base msr.
900     ///
901     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
902     /// only restores successfully, when the LAPIC is correctly configured.
903     ///
904     /// Arguments: CpuState
905     /// # Example
906     ///
907     /// ```rust
908     /// # extern crate hypervisor;
909     /// # use hypervisor::KvmHypervisor;
910     /// # use std::sync::Arc;
911     /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
912     /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
913     /// let vm = hv.create_vm().expect("new VM fd creation failed");
914     /// vm.enable_split_irq().unwrap();
915     /// let vcpu = vm.create_vcpu(0).unwrap();
916     /// let state = vcpu.state().unwrap();
917     /// vcpu.set_state(&state).unwrap();
918     /// ```
919     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
920         self.set_mp_state(state.mp_state)?;
921         self.set_regs(&state.regs)?;
922         self.set_sregs(&state.sregs)?;
923         self.set_xsave(&state.xsave)?;
924         self.set_xcrs(&state.xcrs)?;
925         self.set_lapic(&state.lapic_state)?;
926         self.set_fpu(&state.fpu)?;
927 
928         // Try to set all MSRs previously stored.
929         // If the number of MSRs set from SET_MSRS is different from the
930         // expected amount, we fallback onto a slower method by setting MSRs
931         // by chunks. This is the only way to make sure we try to set as many
932         // MSRs as possible, even if some MSRs are not supported.
933         let expected_num_msrs = state.msrs.as_fam_struct_ref().nmsrs as usize;
934         let num_msrs = self.set_msrs(&state.msrs)?;
935         if num_msrs != expected_num_msrs {
936             let mut faulty_msr_index = num_msrs;
937 
938             loop {
939                 warn!(
940                     "Detected faulty MSR 0x{:x} while setting MSRs",
941                     state.msrs.as_slice()[faulty_msr_index].index
942                 );
943 
944                 let start_pos = faulty_msr_index + 1;
945                 let sub_msr_entries = MsrEntries::from_entries(&state.msrs.as_slice()[start_pos..]);
946                 let expected_num_msrs = sub_msr_entries.as_fam_struct_ref().nmsrs as usize;
947                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
948 
949                 if num_msrs == expected_num_msrs {
950                     break;
951                 }
952 
953                 faulty_msr_index = start_pos + num_msrs;
954             }
955         }
956 
957         self.set_vcpu_events(&state.vcpu_events)?;
958 
959         Ok(())
960     }
961     #[allow(unused_variables)]
962     #[cfg(target_arch = "aarch64")]
963     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
964         warn!("CPU state was not restored");
965         Ok(())
966     }
967 }
968 
/// Device struct for KVM
pub struct KvmDevice {
    // Wrapped KVM device file descriptor (kvm-ioctls DeviceFd).
    fd: DeviceFd,
}
973 
974 impl device::Device for KvmDevice {
975     ///
976     /// Set device attribute
977     ///
978     fn set_device_attr(&self, attr: &DeviceAttr) -> device::Result<()> {
979         self.fd
980             .set_device_attr(attr)
981             .map_err(|e| device::HypervisorDeviceError::SetDeviceAttribute(e.into()))
982     }
983 }
984 
impl AsRawFd for KvmDevice {
    // Expose the raw file descriptor of the underlying KVM device so it can
    // be handed to interfaces that operate on plain fds.
    fn as_raw_fd(&self) -> RawFd {
        self.fd.as_raw_fd()
    }
}
990