xref: /cloud-hypervisor/hypervisor/src/kvm/mod.rs (revision 190d90196fff389b60b93b57acf958957b71b249)
// Copyright © 2024 Institute of Software, CAS. All rights reserved.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, RwLock};

use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use vmm_sys_util::eventfd::EventFd;

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{check_required_kvm_extensions, is_system_register, VcpuKvmState};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
#[cfg(target_arch = "riscv64")]
use crate::arch::riscv64::aia::{Vaia, VaiaConfig};
#[cfg(target_arch = "riscv64")]
use crate::riscv64::aia::KvmAiaImsics;
#[cfg(target_arch = "riscv64")]
pub use crate::riscv64::{
    aia::AiaImsicsState as AiaState, check_required_kvm_extensions, is_non_core_register,
    VcpuKvmState,
};
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset_of};
use crate::{cpu, hypervisor, HypervisorType};
#[cfg(target_arch = "riscv64")]
use crate::{offset_of, riscv64_reg_id};
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState};

#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
// riscv64 dependencies
#[cfg(target_arch = "riscv64")]
pub mod riscv64;
#[cfg(target_arch = "aarch64")]
use std::mem;

///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
pub use kvm_bindings::kvm_vcpu_events as VcpuEvents;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice,
    kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
#[cfg(target_arch = "riscv64")]
use kvm_bindings::{kvm_riscv_core, user_regs_struct, KVM_REG_RISCV_CORE};
#[cfg(feature = "tdx")]
use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO};
pub use kvm_ioctls::{Cap, Kvm, VcpuExit};
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_iowr_nr};
pub use {kvm_bindings, kvm_ioctls};

#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::regs;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use crate::RegList;

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(target_arch = "x86_64")]
use vmm_sys_util::ioctl_io_nr;

#[cfg(target_arch = "x86_64")]
ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a);

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

#[cfg(feature = "tdx")]
#[derive(Copy, Clone)]
pub struct KvmTdxExit {
    pub type_: u32,
    pub pad: u32,
    pub u: KvmTdxExitU,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Copy, Clone)]
pub union KvmTdxExitU {
    pub vmcall: KvmTdxExitVmcall,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default, Copy, Clone, PartialEq)]
pub struct KvmTdxExitVmcall {
    pub type_: u64,
    pub subfunction: u64,
    pub reg_mask: u64,
    pub in_r12: u64,
    pub in_r13: u64,
    pub in_r14: u64,
    pub in_r15: u64,
    pub in_rbx: u64,
    pub in_rdi: u64,
    pub in_rsi: u64,
    pub in_r8: u64,
    pub in_r9: u64,
    pub in_rdx: u64,
    pub status_code: u64,
    pub out_r11: u64,
    pub out_r12: u64,
    pub out_r13: u64,
    pub out_r14: u64,
    pub out_r15: u64,
    pub out_rbx: u64,
    pub out_rdi: u64,
    pub out_rsi: u64,
    pub out_r8: u64,
    pub out_r9: u64,
    pub out_rdx: u64,
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

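// Illustrative round trip through the conversions above (values hypothetical,
// not part of the original source): a read-write region with dirty logging.
//
//     let kvm_region = kvm_userspace_memory_region {
//         slot: 0,
//         guest_phys_addr: 0x1_0000,
//         memory_size: 0x1000,
//         userspace_addr: 0x7f00_0000_0000,
//         flags: KVM_MEM_LOG_DIRTY_PAGES, // writable + dirty logging
//     };
//     let generic = UserMemoryRegion::from(kvm_region);
//     assert!(generic.flags & USER_MEMORY_REGION_WRITE != 0);
//     assert!(generic.flags & USER_MEMORY_REGION_LOG_DIRTY != 0);
//     assert_eq!(kvm_userspace_memory_region::from(generic).flags,
//                KVM_MEM_LOG_DIRTY_PAGES);
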
impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_bindings::kvm_one_reg> for crate::Register {
    fn from(s: kvm_bindings::kvm_one_reg) -> Self {
        crate::Register::Kvm(s)
    }
}

impl From<crate::Register> for kvm_bindings::kvm_one_reg {
    fn from(e: crate::Register) -> Self {
        match e {
            crate::Register::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("Register is not valid"),
        }
    }
}

#[cfg(target_arch = "aarch64")]
impl From<kvm_bindings::kvm_vcpu_init> for crate::VcpuInit {
    fn from(s: kvm_bindings::kvm_vcpu_init) -> Self {
        crate::VcpuInit::Kvm(s)
    }
}

#[cfg(target_arch = "aarch64")]
impl From<crate::VcpuInit> for kvm_bindings::kvm_vcpu_init {
    fn from(e: crate::VcpuInit) -> Self {
        match e {
            crate::VcpuInit::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("VcpuInit is not valid"),
        }
    }
}

#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<kvm_bindings::RegList> for crate::RegList {
    fn from(s: kvm_bindings::RegList) -> Self {
        crate::RegList::Kvm(s)
    }
}

#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
impl From<crate::RegList> for kvm_bindings::RegList {
    fn from(e: crate::RegList) -> Self {
        match e {
            crate::RegList::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("RegList is not valid"),
        }
    }
}

#[cfg(not(target_arch = "riscv64"))]
impl From<kvm_bindings::kvm_regs> for crate::StandardRegisters {
    fn from(s: kvm_bindings::kvm_regs) -> Self {
        crate::StandardRegisters::Kvm(s)
    }
}

#[cfg(not(target_arch = "riscv64"))]
impl From<crate::StandardRegisters> for kvm_bindings::kvm_regs {
    fn from(e: crate::StandardRegisters) -> Self {
        match e {
            crate::StandardRegisters::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}

#[cfg(target_arch = "riscv64")]
impl From<kvm_bindings::kvm_riscv_core> for crate::StandardRegisters {
    fn from(s: kvm_bindings::kvm_riscv_core) -> Self {
        crate::StandardRegisters::Kvm(s)
    }
}

#[cfg(target_arch = "riscv64")]
impl From<crate::StandardRegisters> for kvm_bindings::kvm_riscv_core {
    fn from(e: crate::StandardRegisters) -> Self {
        match e {
            crate::StandardRegisters::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("StandardRegisters are not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
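    ///
    /// # Example
    ///
    /// An illustrative sketch (not part of the original source); obtaining a
    /// concrete `KvmVm` from the `Arc<dyn vm::Vm>` returned by `create_vm` is
    /// assumed:
    ///
    /// ```ignore
    /// use kvm_ioctls::Cap;
    ///
    /// if kvm_vm.check_extension(Cap::MsiDevid) {
    ///     // The host KVM accepts a device ID when routing MSIs.
    /// }
    /// ```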
    pub fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

/// Implementation of Vm trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }

    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }

    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
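    /// # Example
    ///
    /// A minimal sketch (not part of the original source), with `vm` created
    /// as in the `vm::Vm` example above and the GSI number chosen arbitrarily:
    ///
    /// ```ignore
    /// use vmm_sys_util::eventfd::EventFd;
    ///
    /// let evt = EventFd::new(0).unwrap();
    /// vm.register_irqfd(&evt, 30).unwrap();
    /// // Writing to `evt` now injects GSI 30 into the guest.
    /// ```
    ///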
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }

    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }

    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let fd = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: Arc::new(Mutex::new(fd)),
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }

    #[cfg(target_arch = "riscv64")]
    ///
    /// Creates a virtual AIA device.
    ///
    fn create_vaia(&self, config: VaiaConfig) -> vm::Result<Arc<Mutex<dyn Vaia>>> {
        let aia_device = KvmAiaImsics::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVaia(anyhow!("Vaia error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(aia_device)))
    }

    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
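    /// # Example
    ///
    /// An illustrative sketch (not part of the original source); the MMIO
    /// address is hypothetical:
    ///
    /// ```ignore
    /// use hypervisor::IoEventAddress;
    /// use vmm_sys_util::eventfd::EventFd;
    ///
    /// let evt = EventFd::new(0).unwrap();
    /// let addr = IoEventAddress::Mmio(0xd000_0000);
    /// // `None` datamatch: any write to the address signals `evt`.
    /// vm.register_ioevent(&evt, &addr, None).unwrap();
    /// ```
    ///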
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }

    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it must fit in a u16, i.e. it cannot exceed
                    // 65535.
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we only support one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits
                    // of the 'segment' data. This resolves the range-checking
                    // problem and gives every device a distinct `devid`, at the
                    // cost of supporting at most 256 segments.
                    //
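                    // Worked example (illustrative, devid value hypothetical):
                    // for devid = 0x0002_08fa (segment 0x0002, bus 0x08,
                    // device 0x1f, function 0x2):
                    //   (0x0002_08fa & 0x00ff_0000) >> 8 = 0x0000_0200
                    //    0x0002_08fa & 0xff              = 0x0000_00fa
                    // so modified_devid = 0x0000_02fa, which fits in 16 bits.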
                    let modified_devid = ((cfg.devid & 0x00ff_0000) >> 8) | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        let irq_routing =
            kvm_bindings::fam_wrappers::KvmIrqRouting::from_entries(&entries).unwrap();

        self.fd
            .set_gsi_routing(&irq_routing)
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }

    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
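    /// # Example
    ///
    /// A minimal sketch (not part of the original source); the addresses are
    /// hypothetical:
    ///
    /// ```ignore
    /// let region = vm.make_user_memory_region(
    ///     0,         // slot
    ///     0x1_0000,  // guest_phys_addr
    ///     0x20_0000, // memory_size
    ///     host_addr, // userspace_addr (host virtual address of the mapping)
    ///     false,     // readonly
    ///     true,      // log_dirty_pages
    /// );
    /// vm.create_user_memory_region(region).unwrap();
    /// ```
    ///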
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }

    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }

    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }

    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut crate::VcpuInit) -> vm::Result<()> {
        let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
        self.fd
            .get_preferred_target(&mut kvm_kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))?;
        *kvi = kvm_kvi.into();
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }

    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }

    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }

    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }

    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
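    /// Each `u64` in the returned vector covers 64 guest pages, so with 4 KiB
    /// pages the bitmap holds roughly `memory_size / 4096 / 64` words (rounded
    /// up). An illustrative sketch (slot and sizes hypothetical):
    ///
    /// ```ignore
    /// let bitmap = vm.get_dirty_log(0, 0x1_0000, 0x20_0000).unwrap();
    /// let dirty_pages: u32 = bitmap.iter().map(|w| w.count_ones()).sum();
    /// ```
    ///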
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }

    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}

pub type KvmResult<T> = result::Result<T, KvmError>;

impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }

    /// Check if the hypervisor is available
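    ///
    /// # Example
    ///
    /// ```
    /// use hypervisor::kvm::KvmHypervisor;
    ///
    /// if KvmHypervisor::is_available().unwrap() {
    ///     let _hypervisor = KvmHypervisor::new().unwrap();
    /// }
    /// ```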
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}

/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    ///
    /// Create a Vm of a specific type using the underlying hypervisor, passing memory size
    /// Return a hypervisor-agnostic Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type_and_memory(0).unwrap();
    /// ```
    fn create_vm_with_type_and_memory(
        &self,
        vm_type: u64,
        #[cfg(feature = "sev_snp")] _mem_size: u64,
    ) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        self.create_vm_with_type(vm_type)
    }

    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, the ioctl was
                        // interrupted and we have to retry, as this cannot
                        // be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
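        // For example (illustrative): on a host with a 40-bit IPA limit,
        // get_host_ipa_limit() returns 40 and the VM is created with
        // vm_type = 40, matching KVM's KVM_VM_TYPE_ARM_IPA_SIZE encoding.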
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}

/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: Arc<Mutex<VcpuFd>>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}

/// Implementation of Vcpu trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// ```
impl cpu::Vcpu for KvmVcpu {
    ///
    /// Returns StandardRegisters with default value set
    ///
    fn create_standard_regs(&self) -> StandardRegisters {
        #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
        {
            kvm_bindings::kvm_regs::default().into()
        }
        #[cfg(target_arch = "riscv64")]
        {
            kvm_bindings::kvm_riscv_core::default().into()
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .lock()
            .unwrap()
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }

    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
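    /// Each core register is addressed by an ID built from the register size
    /// and its byte offset within `kvm_regs`; a sketch of what the
    /// `arm64_core_reg_id!` macro used below evaluates to (illustrative):
    ///
    /// ```ignore
    /// let id = KVM_REG_ARM64
    ///     | KVM_REG_SIZE_U64
    ///     | u64::from(KVM_REG_ARM_CORE)
    ///     | (offset_of!(user_pt_regs, sp) / std::mem::size_of::<u32>()) as u64;
    /// ```
    ///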
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state = kvm_regs::default();
        let mut off = offset_of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-a architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
1363         for i in 0..31 {
1364             let mut bytes = [0_u8; 8];
1365             self.fd
1366                 .lock()
1367                 .unwrap()
1368                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1369                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1370             state.regs.regs[i] = u64::from_le_bytes(bytes);
1371             off += std::mem::size_of::<u64>();
1372         }
1373 
1374         // We are now entering the "Other register" section of the ARMv8-a architecture.
1375         // First one, stack pointer.
1376         let off = offset_of!(user_pt_regs, sp);
1377         let mut bytes = [0_u8; 8];
1378         self.fd
1379             .lock()
1380             .unwrap()
1381             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1382             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1383         state.regs.sp = u64::from_le_bytes(bytes);
1384 
1385         // Second one, the program counter.
1386         let off = offset_of!(user_pt_regs, pc);
1387         let mut bytes = [0_u8; 8];
1388         self.fd
1389             .lock()
1390             .unwrap()
1391             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1392             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1393         state.regs.pc = u64::from_le_bytes(bytes);
1394 
1395         // Next is the processor state.
1396         let off = offset_of!(user_pt_regs, pstate);
1397         let mut bytes = [0_u8; 8];
1398         self.fd
1399             .lock()
1400             .unwrap()
1401             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1402             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1403         state.regs.pstate = u64::from_le_bytes(bytes);
1404 
1405         // The stack pointer associated with EL1
1406         let off = offset_of!(kvm_regs, sp_el1);
1407         let mut bytes = [0_u8; 8];
1408         self.fd
1409             .lock()
1410             .unwrap()
1411             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1412             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1413         state.sp_el1 = u64::from_le_bytes(bytes);
1414 
1415         // Exception Link Register for EL1, when taking an exception to EL1, this register
1416         // holds the address to which to return afterwards.
1417         let off = offset_of!(kvm_regs, elr_el1);
1418         let mut bytes = [0_u8; 8];
1419         self.fd
1420             .lock()
1421             .unwrap()
1422             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1423             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1424         state.elr_el1 = u64::from_le_bytes(bytes);
1425 
1426         // Saved Program Status Registers, there are 5 of them used in the kernel.
1427         let mut off = offset_of!(kvm_regs, spsr);
1428         for i in 0..KVM_NR_SPSR as usize {
1429             let mut bytes = [0_u8; 8];
1430             self.fd
1431                 .lock()
1432                 .unwrap()
1433                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), &mut bytes)
1434                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1435             state.spsr[i] = u64::from_le_bytes(bytes);
1436             off += std::mem::size_of::<u64>();
1437         }
1438 
1439         // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
1440         // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
1441         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1442         for i in 0..32 {
1443             let mut bytes = [0_u8; 16];
1444             self.fd
1445                 .lock()
1446                 .unwrap()
1447                 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off), &mut bytes)
1448                 .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1449             state.fp_regs.vregs[i] = u128::from_le_bytes(bytes);
1450             off += mem::size_of::<u128>();
1451         }
1452 
1453         // Floating-point Status Register
1454         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1455         let mut bytes = [0_u8; 4];
1456         self.fd
1457             .lock()
1458             .unwrap()
1459             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1460             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1461         state.fp_regs.fpsr = u32::from_le_bytes(bytes);
1462 
1463         // Floating-point Control Register
1464         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1465         let mut bytes = [0_u8; 4];
1466         self.fd
1467             .lock()
1468             .unwrap()
1469             .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off), &mut bytes)
1470             .map_err(|e| cpu::HypervisorCpuError::GetAarchCoreRegister(e.into()))?;
1471         state.fp_regs.fpcr = u32::from_le_bytes(bytes);
1472         Ok(state.into())
1473     }
1474 
1475     #[cfg(target_arch = "riscv64")]
1476     ///
1477     /// Returns the RISC-V vCPU core registers.
1478     /// The `KVM_GET_REGS` ioctl is not available on RISC-V 64-bit,
1479     /// `KVM_GET_ONE_REG` is used to get registers one by one.
1480     ///
1481     fn get_regs(&self) -> cpu::Result<StandardRegisters> {
1482         let mut state = kvm_riscv_core::default();
1483 
1484         /// Macro used to read the RISC-V register named by `$reg_name` from
1485         /// the KVM vCPU into `state`.
1486         macro_rules! riscv64_get_one_reg_from_vcpu {
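                 // NOTE: `mode` is a direct field of `kvm_riscv_core` rather
                 // than a field of its inner `regs` (`user_regs_struct`), so it
                 // needs its own macro arm with a different `offset_of!`.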
1487             (mode) => {
1488                 let off = offset_of!(kvm_riscv_core, mode);
1489                 let mut bytes = [0_u8; 8];
1490                 self.fd
1491                     .lock()
1492                     .unwrap()
1493                     .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
1494                     .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
1495                 state.mode = u64::from_le_bytes(bytes);
1496             };
1497             ($reg_name:ident) => {
1498                 let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
1499                 let mut bytes = [0_u8; 8];
1500                 self.fd
1501                     .lock()
1502                     .unwrap()
1503                     .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
1504                     .map_err(|e| cpu::HypervisorCpuError::GetRiscvCoreRegister(e.into()))?;
1505                 state.regs.$reg_name = u64::from_le_bytes(bytes);
1506             };
1507         }
1508 
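             // As an illustrative note, `riscv64_get_one_reg_from_vcpu!(pc)`
             // expands to roughly:
             //
             //     let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc);
             //     let mut bytes = [0_u8; 8];
             //     self.fd.lock().unwrap()
             //         .get_one_reg(riscv64_reg_id!(KVM_REG_RISCV_CORE, off), &mut bytes)
             //         .map_err(...)?;
             //     state.regs.pc = u64::from_le_bytes(bytes);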
1509         riscv64_get_one_reg_from_vcpu!(pc);
1510         riscv64_get_one_reg_from_vcpu!(ra);
1511         riscv64_get_one_reg_from_vcpu!(sp);
1512         riscv64_get_one_reg_from_vcpu!(gp);
1513         riscv64_get_one_reg_from_vcpu!(tp);
1514         riscv64_get_one_reg_from_vcpu!(t0);
1515         riscv64_get_one_reg_from_vcpu!(t1);
1516         riscv64_get_one_reg_from_vcpu!(t2);
1517         riscv64_get_one_reg_from_vcpu!(s0);
1518         riscv64_get_one_reg_from_vcpu!(s1);
1519         riscv64_get_one_reg_from_vcpu!(a0);
1520         riscv64_get_one_reg_from_vcpu!(a1);
1521         riscv64_get_one_reg_from_vcpu!(a2);
1522         riscv64_get_one_reg_from_vcpu!(a3);
1523         riscv64_get_one_reg_from_vcpu!(a4);
1524         riscv64_get_one_reg_from_vcpu!(a5);
1525         riscv64_get_one_reg_from_vcpu!(a6);
1526         riscv64_get_one_reg_from_vcpu!(a7);
1527         riscv64_get_one_reg_from_vcpu!(s2);
1528         riscv64_get_one_reg_from_vcpu!(s3);
1529         riscv64_get_one_reg_from_vcpu!(s4);
1530         riscv64_get_one_reg_from_vcpu!(s5);
1531         riscv64_get_one_reg_from_vcpu!(s6);
1532         riscv64_get_one_reg_from_vcpu!(s7);
1533         riscv64_get_one_reg_from_vcpu!(s8);
1534         riscv64_get_one_reg_from_vcpu!(s9);
1535         riscv64_get_one_reg_from_vcpu!(s10);
1536         riscv64_get_one_reg_from_vcpu!(s11);
1537         riscv64_get_one_reg_from_vcpu!(t3);
1538         riscv64_get_one_reg_from_vcpu!(t4);
1539         riscv64_get_one_reg_from_vcpu!(t5);
1540         riscv64_get_one_reg_from_vcpu!(t6);
1541         riscv64_get_one_reg_from_vcpu!(mode);
1542 
1543         Ok(state.into())
1544     }
1545 
1546     #[cfg(target_arch = "x86_64")]
1547     ///
1548     /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
1549     ///
1550     fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
1551         let regs = (*regs).into();
1552         self.fd
1553             .lock()
1554             .unwrap()
1555             .set_regs(&regs)
1556             .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
1557     }
1558 
1559     ///
1560     /// Sets the vCPU general purpose registers.
1561     /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
1562     /// is used to set registers one by one.
1563     ///
1564     #[cfg(target_arch = "aarch64")]
1565     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1566         // The function follows the exact order of the registers in `state`.
1567         // Look there for some additional info on registers.
1568         let kvm_regs_state: kvm_regs = (*state).into();
1569         let mut off = offset_of!(user_pt_regs, regs);
1570         for i in 0..31 {
1571             self.fd
1572                 .lock()
1573                 .unwrap()
1574                 .set_one_reg(
1575                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1576                     &kvm_regs_state.regs.regs[i].to_le_bytes(),
1577                 )
1578                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1579             off += std::mem::size_of::<u64>();
1580         }
1581 
1582         let off = offset_of!(user_pt_regs, sp);
1583         self.fd
1584             .lock()
1585             .unwrap()
1586             .set_one_reg(
1587                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1588                 &kvm_regs_state.regs.sp.to_le_bytes(),
1589             )
1590             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1591 
1592         let off = offset_of!(user_pt_regs, pc);
1593         self.fd
1594             .lock()
1595             .unwrap()
1596             .set_one_reg(
1597                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1598                 &kvm_regs_state.regs.pc.to_le_bytes(),
1599             )
1600             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1601 
1602         let off = offset_of!(user_pt_regs, pstate);
1603         self.fd
1604             .lock()
1605             .unwrap()
1606             .set_one_reg(
1607                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1608                 &kvm_regs_state.regs.pstate.to_le_bytes(),
1609             )
1610             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1611 
1612         let off = offset_of!(kvm_regs, sp_el1);
1613         self.fd
1614             .lock()
1615             .unwrap()
1616             .set_one_reg(
1617                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1618                 &kvm_regs_state.sp_el1.to_le_bytes(),
1619             )
1620             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1621 
1622         let off = offset_of!(kvm_regs, elr_el1);
1623         self.fd
1624             .lock()
1625             .unwrap()
1626             .set_one_reg(
1627                 arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1628                 &kvm_regs_state.elr_el1.to_le_bytes(),
1629             )
1630             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1631 
1632         let mut off = offset_of!(kvm_regs, spsr);
1633         for i in 0..KVM_NR_SPSR as usize {
1634             self.fd
1635                 .lock()
1636                 .unwrap()
1637                 .set_one_reg(
1638                     arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
1639                     &kvm_regs_state.spsr[i].to_le_bytes(),
1640                 )
1641                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1642             off += std::mem::size_of::<u64>();
1643         }
1644 
1645         let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
1646         for i in 0..32 {
1647             self.fd
1648                 .lock()
1649                 .unwrap()
1650                 .set_one_reg(
1651                     arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
1652                     &kvm_regs_state.fp_regs.vregs[i].to_le_bytes(),
1653                 )
1654                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1655             off += mem::size_of::<u128>();
1656         }
1657 
1658         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
1659         self.fd
1660             .lock()
1661             .unwrap()
1662             .set_one_reg(
1663                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1664                 &kvm_regs_state.fp_regs.fpsr.to_le_bytes(),
1665             )
1666             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1667 
1668         let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
1669         self.fd
1670             .lock()
1671             .unwrap()
1672             .set_one_reg(
1673                 arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
1674                 &kvm_regs_state.fp_regs.fpcr.to_le_bytes(),
1675             )
1676             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
1677         Ok(())
1678     }
1679 
1680     #[cfg(target_arch = "riscv64")]
1681     ///
1682     /// Sets the RISC-V vCPU core registers.
1683     /// The `KVM_SET_REGS` ioctl is not available on RISC-V 64-bit,
1684     /// `KVM_SET_ONE_REG` is used to set registers one by one.
1685     ///
1686     fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
1687         // The function follows the exact order of the registers in `state`.
1688         // Look there for some additional info on registers.
1689         let kvm_regs_state: kvm_riscv_core = (*state).into();
1690 
1691         /// Macro used to write the value of the RISC-V register `$reg_name`
1692         /// stored in `state` to the KVM vCPU.
1693         macro_rules! riscv64_set_one_reg_to_vcpu {
1694             (mode) => {
1695                 let off = offset_of!(kvm_riscv_core, mode);
1696                 self.fd
1697                     .lock()
1698                     .unwrap()
1699                     .set_one_reg(
1700                         riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1701                         &kvm_regs_state.mode.to_le_bytes(),
1702                     )
1703                     .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1704             };
1705             ($reg_name:ident) => {
1706                 let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name);
1707                 self.fd
1708                     .lock()
1709                     .unwrap()
1710                     .set_one_reg(
1711                         riscv64_reg_id!(KVM_REG_RISCV_CORE, off),
1712                         &kvm_regs_state.regs.$reg_name.to_le_bytes(),
1713                     )
1714                     .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
1715             };
1716         }
1717 
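             // Mirrors `riscv64_get_one_reg_from_vcpu!` in `get_regs()` above:
             // each invocation below issues one KVM_SET_ONE_REG call for the
             // named register.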
1718         riscv64_set_one_reg_to_vcpu!(pc);
1719         riscv64_set_one_reg_to_vcpu!(ra);
1720         riscv64_set_one_reg_to_vcpu!(sp);
1721         riscv64_set_one_reg_to_vcpu!(gp);
1722         riscv64_set_one_reg_to_vcpu!(tp);
1723         riscv64_set_one_reg_to_vcpu!(t0);
1724         riscv64_set_one_reg_to_vcpu!(t1);
1725         riscv64_set_one_reg_to_vcpu!(t2);
1726         riscv64_set_one_reg_to_vcpu!(s0);
1727         riscv64_set_one_reg_to_vcpu!(s1);
1728         riscv64_set_one_reg_to_vcpu!(a0);
1729         riscv64_set_one_reg_to_vcpu!(a1);
1730         riscv64_set_one_reg_to_vcpu!(a2);
1731         riscv64_set_one_reg_to_vcpu!(a3);
1732         riscv64_set_one_reg_to_vcpu!(a4);
1733         riscv64_set_one_reg_to_vcpu!(a5);
1734         riscv64_set_one_reg_to_vcpu!(a6);
1735         riscv64_set_one_reg_to_vcpu!(a7);
1736         riscv64_set_one_reg_to_vcpu!(s2);
1737         riscv64_set_one_reg_to_vcpu!(s3);
1738         riscv64_set_one_reg_to_vcpu!(s4);
1739         riscv64_set_one_reg_to_vcpu!(s5);
1740         riscv64_set_one_reg_to_vcpu!(s6);
1741         riscv64_set_one_reg_to_vcpu!(s7);
1742         riscv64_set_one_reg_to_vcpu!(s8);
1743         riscv64_set_one_reg_to_vcpu!(s9);
1744         riscv64_set_one_reg_to_vcpu!(s10);
1745         riscv64_set_one_reg_to_vcpu!(s11);
1746         riscv64_set_one_reg_to_vcpu!(t3);
1747         riscv64_set_one_reg_to_vcpu!(t4);
1748         riscv64_set_one_reg_to_vcpu!(t5);
1749         riscv64_set_one_reg_to_vcpu!(t6);
1750         riscv64_set_one_reg_to_vcpu!(mode);
1751 
1752         Ok(())
1753     }
1754 
1755     #[cfg(target_arch = "x86_64")]
1756     ///
1757     /// Returns the vCPU special registers.
1758     ///
1759     fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
1760         Ok(self
1761             .fd
1762             .lock()
1763             .unwrap()
1764             .get_sregs()
1765             .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
1766             .into())
1767     }
1768 
1769     #[cfg(target_arch = "x86_64")]
1770     ///
1771     /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
1772     ///
1773     fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
1774         let sregs = (*sregs).into();
1775         self.fd
1776             .lock()
1777             .unwrap()
1778             .set_sregs(&sregs)
1779             .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
1780     }
1781 
1782     #[cfg(target_arch = "x86_64")]
1783     ///
1784     /// Returns the floating point state (FPU) from the vCPU.
1785     ///
1786     fn get_fpu(&self) -> cpu::Result<FpuState> {
1787         Ok(self
1788             .fd
1789             .lock()
1790             .unwrap()
1791             .get_fpu()
1792             .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
1793             .into())
1794     }
1795 
1796     #[cfg(target_arch = "x86_64")]
1797     ///
1798     /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
1799     ///
1800     fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
1801         let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
1802         self.fd
1803             .lock()
1804             .unwrap()
1805             .set_fpu(&fpu)
1806             .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
1807     }
1808 
1809     #[cfg(target_arch = "x86_64")]
1810     ///
1811     /// X86 specific call to setup the CPUID registers.
1812     ///
1813     fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
1814         let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
1815             cpuid.iter().map(|e| (*e).into()).collect();
1816         let kvm_cpuid = <CpuId>::from_entries(&cpuid)
1817             .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;
1818 
1819         self.fd
1820             .lock()
1821             .unwrap()
1822             .set_cpuid2(&kvm_cpuid)
1823             .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
1824     }
1825 
1826     #[cfg(target_arch = "x86_64")]
1827     ///
1828     /// X86 specific call to enable the Hyper-V SynIC.
1829     ///
1830     fn enable_hyperv_synic(&self) -> cpu::Result<()> {
1831         // Update the information about Hyper-V SynIC being enabled and
1832         // emulated as it will influence later which MSRs should be saved.
1833         self.hyperv_synic.store(true, Ordering::Release);
1834 
1835         let cap = kvm_enable_cap {
1836             cap: KVM_CAP_HYPERV_SYNIC,
1837             ..Default::default()
1838         };
1839         self.fd
1840             .lock()
1841             .unwrap()
1842             .enable_cap(&cap)
1843             .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
1844     }
1845 
1846     ///
1847     /// X86 specific call to retrieve the CPUID registers.
1848     ///
1849     #[cfg(target_arch = "x86_64")]
1850     fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
1851         let kvm_cpuid = self
1852             .fd
1853             .lock()
1854             .unwrap()
1855             .get_cpuid2(num_entries)
1856             .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;
1857 
1858         let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();
1859 
1860         Ok(v)
1861     }
1862 
1863     #[cfg(target_arch = "x86_64")]
1864     ///
1865     /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1866     ///
1867     fn get_lapic(&self) -> cpu::Result<LapicState> {
1868         Ok(self
1869             .fd
1870             .lock()
1871             .unwrap()
1872             .get_lapic()
1873             .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
1874             .into())
1875     }
1876 
1877     #[cfg(target_arch = "x86_64")]
1878     ///
1879     /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
1880     ///
1881     fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
1882         let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
1883         self.fd
1884             .lock()
1885             .unwrap()
1886             .set_lapic(&klapic)
1887             .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
1888     }
1889 
1890     #[cfg(target_arch = "x86_64")]
1891     ///
1892     /// Reads the model-specific registers (MSR) for this vCPU, returning the number of entries actually read.
1893     ///
1894     fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
1895         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1896         let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1897         let succ = self
1898             .fd
1899             .lock()
1900             .unwrap()
1901             .get_msrs(&mut kvm_msrs)
1902             .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;
1903 
1904         msrs[..succ].copy_from_slice(
1905             &kvm_msrs.as_slice()[..succ]
1906                 .iter()
1907                 .map(|e| (*e).into())
1908                 .collect::<Vec<MsrEntry>>(),
1909         );
1910 
1911         Ok(succ)
1912     }
1913 
1914     #[cfg(target_arch = "x86_64")]
1915     ///
1916     /// Set up the model-specific registers (MSR) for this vCPU.
1917     /// Returns the number of MSR entries actually written.
1918     ///
1919     fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
1920         let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
1921         let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
1922         self.fd
1923             .lock()
1924             .unwrap()
1925             .set_msrs(&kvm_msrs)
1926             .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
1927     }
1928 
1929     ///
1930     /// Returns the vcpu's current "multiprocessing state".
1931     ///
1932     fn get_mp_state(&self) -> cpu::Result<MpState> {
1933         Ok(self
1934             .fd
1935             .lock()
1936             .unwrap()
1937             .get_mp_state()
1938             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1939             .into())
1940     }
1941 
1942     ///
1943     /// Sets the vcpu's current "multiprocessing state".
1944     ///
1945     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1946         self.fd
1947             .lock()
1948             .unwrap()
1949             .set_mp_state(mp_state.into())
1950             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1951     }
1952 
1953     #[cfg(target_arch = "x86_64")]
1954     ///
1955     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1956     ///
1957     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1958         let tr = self
1959             .fd
1960             .lock()
1961             .unwrap()
1962             .translate_gva(gva)
1963             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1964         // tr.valid is set if the GVA is mapped to a valid GPA.
1965         match tr.valid {
1966             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1967                 "Invalid GVA: {:#x}",
1968                 gva
1969             ))),
1970             _ => Ok((tr.physical_address, 0)),
1971         }
1972     }
1973 
1974     ///
1975     /// Triggers the running of the current virtual CPU, returning an exit reason.
1976     ///
1977     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1978         match self.fd.lock().unwrap().run() {
1979             Ok(run) => match run {
1980                 #[cfg(target_arch = "x86_64")]
1981                 VcpuExit::IoIn(addr, data) => {
1982                     if let Some(vm_ops) = &self.vm_ops {
1983                         return vm_ops
1984                             .pio_read(addr.into(), data)
1985                             .map(|_| cpu::VmExit::Ignore)
1986                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1987                     }
1988 
1989                     Ok(cpu::VmExit::Ignore)
1990                 }
1991                 #[cfg(target_arch = "x86_64")]
1992                 VcpuExit::IoOut(addr, data) => {
1993                     if let Some(vm_ops) = &self.vm_ops {
1994                         return vm_ops
1995                             .pio_write(addr.into(), data)
1996                             .map(|_| cpu::VmExit::Ignore)
1997                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1998                     }
1999 
2000                     Ok(cpu::VmExit::Ignore)
2001                 }
2002                 #[cfg(target_arch = "x86_64")]
2003                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
2004                 #[cfg(target_arch = "x86_64")]
2005                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
2006 
2007                 #[cfg(target_arch = "aarch64")]
2008                 VcpuExit::SystemEvent(event_type, flags) => {
2009                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
2010                     // On AArch64, when the VM is shut down, run() returns
2011                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
2012                     if event_type == KVM_SYSTEM_EVENT_RESET {
2013                         Ok(cpu::VmExit::Reset)
2014                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
2015                         Ok(cpu::VmExit::Shutdown)
2016                     } else {
2017                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2018                             "Unexpected system event with type 0x{:x}, flags 0x{:x?}",
2019                             event_type,
2020                             flags
2021                         )))
2022                     }
2023                 }
2024 
2025                 VcpuExit::MmioRead(addr, data) => {
2026                     if let Some(vm_ops) = &self.vm_ops {
2027                         return vm_ops
2028                             .mmio_read(addr, data)
2029                             .map(|_| cpu::VmExit::Ignore)
2030                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2031                     }
2032 
2033                     Ok(cpu::VmExit::Ignore)
2034                 }
2035                 VcpuExit::MmioWrite(addr, data) => {
2036                     if let Some(vm_ops) = &self.vm_ops {
2037                         return vm_ops
2038                             .mmio_write(addr, data)
2039                             .map(|_| cpu::VmExit::Ignore)
2040                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
2041                     }
2042 
2043                     Ok(cpu::VmExit::Ignore)
2044                 }
2045                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
2046                 #[cfg(feature = "tdx")]
2047                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
2048                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
2049 
2050                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2051                     "Unexpected exit reason on vcpu run: {:?}",
2052                     r
2053                 ))),
2054             },
2055 
2056             Err(ref e) => match e.errno() {
2057                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
2058                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
2059                     "VCPU error {:?}",
2060                     e
2061                 ))),
2062             },
2063         }
2064     }
2065 
2066     #[cfg(target_arch = "x86_64")]
2067     ///
2068     /// Let the guest know that it has been paused, which prevents potential
2069     /// soft lockups when it is resumed.
2070     ///
2071     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
2072         if let Err(e) = self.fd.lock().unwrap().kvmclock_ctrl() {
2073             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
2074             // which could be because we're still in firmware or the guest doesn't
2075             // use KVM clock.
2076             if e.errno() != libc::EINVAL {
2077                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
2078             }
2079         }
2080 
2081         Ok(())
2082     }
2083 
2084     #[cfg(not(target_arch = "riscv64"))]
2085     ///
2086     /// Sets debug registers to set hardware breakpoints and/or enable single step.
2087     ///
2088     fn set_guest_debug(
2089         &self,
2090         addrs: &[vm_memory::GuestAddress],
2091         singlestep: bool,
2092     ) -> cpu::Result<()> {
2093         let mut dbg = kvm_guest_debug {
2094             #[cfg(target_arch = "x86_64")]
2095             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
2096             #[cfg(target_arch = "aarch64")]
2097             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
2098             ..Default::default()
2099         };
2100         if singlestep {
2101             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
2102         }
2103 
2104         // Set the debug registers.
2105         // Here we assume that the number of addresses does not exceed what
2106         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
2107         #[cfg(target_arch = "x86_64")]
2108         {
2109             // Set bits 9 and 10.
2110             // bit 9: GE (global exact breakpoint enable) flag.
2111             // bit 10: always 1.
2112             dbg.arch.debugreg[7] = 0x0600;
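                 // (0x0600 == (1 << 9) | (1 << 10), i.e. exactly those two bits.)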
2113 
2114             for (i, addr) in addrs.iter().enumerate() {
2115                 dbg.arch.debugreg[i] = addr.0;
2116                 // Set global breakpoint enable flag
2117                 dbg.arch.debugreg[7] |= 2 << (i * 2);
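                     // `2 << (i * 2)` sets DR7 bit 2*i+1, i.e. the G0..G3
                     // global enable flags (bits 1, 3, 5 and 7).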
2118             }
2119         }
2120         #[cfg(target_arch = "aarch64")]
2121         {
2122             for (i, addr) in addrs.iter().enumerate() {
2123                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
2124                 // bit 0: 1 (Enabled)
2125                 // bit 1~2: 0b11 (PMC = EL1/EL0)
2126                 // bit 5~8: 0b1111 (BAS = AArch64)
2127                 // others: 0
2128                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
2129                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
2130                 // bit 2~52: VA[2:52]
2131                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
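                     // (The BCR value above works out to 0b1_1110_0111, and
                     // `!0u64 >> 11` keeps address bits [52:0]; AArch64
                     // instructions are 4-byte aligned, so bits [1:0] are zero.)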
2132             }
2133         }
2134         self.fd
2135             .lock()
2136             .unwrap()
2137             .set_guest_debug(&dbg)
2138             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
2139     }
2140 
2141     #[cfg(target_arch = "aarch64")]
2142     fn vcpu_get_finalized_features(&self) -> i32 {
2143         kvm_bindings::KVM_ARM_VCPU_SVE as i32
2144     }
2145 
2146     #[cfg(target_arch = "aarch64")]
2147     fn vcpu_set_processor_features(
2148         &self,
2149         vm: &Arc<dyn crate::Vm>,
2150         kvi: &mut crate::VcpuInit,
2151         id: u8,
2152     ) -> cpu::Result<()> {
2153         use std::arch::is_aarch64_feature_detected;
2154         #[allow(clippy::nonminimal_bool)]
2155         let sve_supported =
2156             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
2157 
2158         let mut kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2159 
2160         // We already checked that the capability is supported.
2161         kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
2162         if vm
2163             .as_any()
2164             .downcast_ref::<crate::kvm::KvmVm>()
2165             .unwrap()
2166             .check_extension(Cap::ArmPmuV3)
2167         {
2168             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
2169         }
2170 
2171         if sve_supported
2172             && vm
2173                 .as_any()
2174                 .downcast_ref::<crate::kvm::KvmVm>()
2175                 .unwrap()
2176                 .check_extension(Cap::ArmSve)
2177         {
2178             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
2179         }
2180 
2181         // Non-boot CPUs are powered off initially.
2182         if id > 0 {
2183             kvm_kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
2184         }
2185 
2186         *kvi = kvm_kvi.into();
2187 
2188         Ok(())
2189     }
2190 
2191     ///
2192     /// Return a `VcpuInit` with default values set.
2193     ///
2194     #[cfg(target_arch = "aarch64")]
2195     fn create_vcpu_init(&self) -> crate::VcpuInit {
2196         kvm_bindings::kvm_vcpu_init::default().into()
2197     }
2198 
2199     #[cfg(target_arch = "aarch64")]
2200     fn vcpu_init(&self, kvi: &crate::VcpuInit) -> cpu::Result<()> {
2201         let kvm_kvi: kvm_bindings::kvm_vcpu_init = (*kvi).into();
2202         self.fd
2203             .lock()
2204             .unwrap()
2205             .vcpu_init(&kvm_kvi)
2206             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
2207     }
2208 
2209     #[cfg(target_arch = "aarch64")]
2210     fn vcpu_finalize(&self, feature: i32) -> cpu::Result<()> {
2211         self.fd
2212             .lock()
2213             .unwrap()
2214             .vcpu_finalize(&feature)
2215             .map_err(|e| cpu::HypervisorCpuError::VcpuFinalize(e.into()))
2216     }
2217 
2218     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
2219     ///
2220     /// Gets a list of the guest registers that are supported for the
2221     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
2222     ///
2223     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
2224         let mut kvm_reg_list: kvm_bindings::RegList = reg_list.clone().into();
2225         self.fd
2226             .lock()
2227             .unwrap()
2228             .get_reg_list(&mut kvm_reg_list)
2229             .map_err(|e: kvm_ioctls::Error| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2230         *reg_list = kvm_reg_list.into();
2231         Ok(())
2232     }
2233 
2234     ///
2235     /// Gets the value of a system register
2236     ///
2237     #[cfg(target_arch = "aarch64")]
2238     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
2239         //
2240         // Arm Architecture Reference Manual defines the encoding of
2241         // AArch64 system registers, see
2242         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
2243         // KVM, however, defines its own ID for each AArch64 system register,
2244         // which is used when calling `KVM_GET/SET_ONE_REG` to access a system
2245         // register of a guest.
2246         // A mapping exists between the Arm standard encoding and the KVM ID.
2247         // This function takes the standard u32 ID as input, converts it to
2248         // the corresponding KVM ID, and calls the `KVM_GET_ONE_REG` API to
2249         // get the value of the system register.
2250         //
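             // As a sketch of the bit arithmetic (assuming `sys_reg` uses the
             // standard MRS/MSR-style encoding, with op2 starting at bit 5):
             // shifting right by 5 lines the fields up with KVM's ARM64_SYSREG
             // layout, i.e. op2 -> bits [2:0], CRm -> [6:3], CRn -> [10:7],
             // op1 -> [13:11] and op0 -> [15:14], which is exactly what the
             // masks below extract.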
2251         let id: u64 = KVM_REG_ARM64
2252             | KVM_REG_SIZE_U64
2253             | KVM_REG_ARM64_SYSREG as u64
2254             | ((((sys_reg) >> 5)
2255                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
2256                     | KVM_REG_ARM64_SYSREG_OP1_MASK
2257                     | KVM_REG_ARM64_SYSREG_CRN_MASK
2258                     | KVM_REG_ARM64_SYSREG_CRM_MASK
2259                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
2260         let mut bytes = [0_u8; 8];
2261         self.fd
2262             .lock()
2263             .unwrap()
2264             .get_one_reg(id, &mut bytes)
2265             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2266         Ok(u64::from_le_bytes(bytes))
2267     }
2268 
2269     ///
2270     /// Gets the value of a non-core register
2271     ///
2272     #[cfg(target_arch = "riscv64")]
2273     fn get_non_core_reg(&self, _non_core_reg: u32) -> cpu::Result<u64> {
2274         unimplemented!()
2275     }
2276 
2277     ///
2278     /// Configure core registers for a given CPU.
2279     ///
2280     #[cfg(target_arch = "aarch64")]
2281     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2282         let kreg_off = offset_of!(kvm_regs, regs);
2283 
2284         // Get the register offset of the PSTATE (Processor State) register.
2285         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
2286         self.fd
2287             .lock()
2288             .unwrap()
2289             .set_one_reg(
2290                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
2291                 &regs::PSTATE_FAULT_BITS_64.to_le_bytes(),
2292             )
2293             .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2294 
2295         // Other vCPUs are powered off initially awaiting PSCI wakeup.
2296         if cpu_id == 0 {
2297             // Setting the PC (Program Counter) to the current program address (kernel address).
2298             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
2299             self.fd
2300                 .lock()
2301                 .unwrap()
2302                 .set_one_reg(
2303                     arm64_core_reg_id!(KVM_REG_SIZE_U64, pc),
2304                     &boot_ip.to_le_bytes(),
2305                 )
2306                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2307 
2308             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2309             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2310             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
2311         // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
2312             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
2313             self.fd
2314                 .lock()
2315                 .unwrap()
2316                 .set_one_reg(
2317                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
2318                     &fdt_start.to_le_bytes(),
2319                 )
2320                 .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?;
2321         }
2322         Ok(())
2323     }
2324 
2325     #[cfg(target_arch = "riscv64")]
2326     ///
2327     /// Configure registers for a given RISC-V CPU.
2328     ///
2329     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
2330         // Setting the A0 register to the hartid of this CPU.
2331         let a0 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a0);
2332         self.fd
2333             .lock()
2334             .unwrap()
2335             .set_one_reg(
2336                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a0),
2337                 &u64::from(cpu_id).to_le_bytes(),
2338             )
2339             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2340 
2341         // Setting the PC (Program Counter) to the current program address (kernel address).
2342         let pc = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc);
2343         self.fd
2344             .lock()
2345             .unwrap()
2346             .set_one_reg(
2347                 riscv64_reg_id!(KVM_REG_RISCV_CORE, pc),
2348                 &boot_ip.to_le_bytes(),
2349             )
2350             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2351 
2352         // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
2353         // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
2354         // not exceed 64 kilobytes in size." -> https://www.kernel.org/doc/Documentation/arch/riscv/boot.txt.
2355         let a1 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a1);
2356         self.fd
2357             .lock()
2358             .unwrap()
2359             .set_one_reg(
2360                 riscv64_reg_id!(KVM_REG_RISCV_CORE, a1),
2361                 &fdt_start.to_le_bytes(),
2362             )
2363             .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?;
2364 
2365         Ok(())
2366     }
2367 
2368     #[cfg(target_arch = "x86_64")]
2369     ///
2370     /// Get the current CPU state
2371     ///
2372     /// Ordering requirements:
2373     ///
2374     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
2375     /// vCPU/LAPIC state. As such, it must be done before most everything
2376     /// else, otherwise we cannot restore everything and expect it to work.
2377     ///
2378     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2379     /// still running.
2380     ///
2381     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
2382     ///
2383     /// GET_VCPU_EVENTS should probably be last to save. The code looks as
2384     /// if it might well be affected by internal state modifications of the
2385     /// GET ioctls.
2386     ///
2387     /// SREGS saves/restores a pending interrupt, similar to what
2388     /// VCPU_EVENTS also does.
2389     ///
2390     /// GET_MSRS requires a prepopulated data structure to do something
2391     /// meaningful. For SET_MSRS it will then contain good data.
2392     ///
2393     /// # Example
2394     ///
2395     /// ```rust
2396     /// # use hypervisor::kvm::KvmHypervisor;
2397     /// # use std::sync::Arc;
2398     /// let kvm = KvmHypervisor::new().unwrap();
2399     /// let hv = Arc::new(kvm);
2400     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2401     /// vm.enable_split_irq().unwrap();
2402     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2403     /// let state = vcpu.state().unwrap();
2404     /// ```
2405     fn state(&self) -> cpu::Result<CpuState> {
2406         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
2407         let mp_state = self.get_mp_state()?.into();
2408         let regs = self.get_regs()?;
2409         let sregs = self.get_sregs()?;
2410         let xsave = self.get_xsave()?;
2411         let xcrs = self.get_xcrs()?;
2412         let lapic_state = self.get_lapic()?;
2413         let fpu = self.get_fpu()?;
2414 
2415         // Try to get all MSRs based on the list previously retrieved from KVM.
2416         // If the number of MSRs obtained from GET_MSRS is different from the
2417         // expected amount, we fall back to a slower method by getting MSRs
2418         // by chunks. This is the only way to make sure we try to get as many
2419         // MSRs as possible, even if some MSRs are not supported.
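             // As an illustrative example: with 30 MSRs where entry 10 is
             // unreadable, GET_MSRS stops and returns 10; the fallback below
             // keeps entries [0..10), skips index 10, and retries from index
             // 11, repeating until a whole remaining chunk succeeds.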
2420         let mut msr_entries = self.msrs.clone();
2421 
2422         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
2423         // emulated.
2424         if self.hyperv_synic.load(Ordering::Acquire) {
2425             let hyperv_synic_msrs = vec![
2426                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
2427                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
2428                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
2429                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
2430                 0x400000b5, 0x400000b6, 0x400000b7,
2431             ];
2432             for index in hyperv_synic_msrs {
2433                 let msr = kvm_msr_entry {
2434                     index,
2435                     ..Default::default()
2436                 };
2437                 msr_entries.push(msr.into());
2438             }
2439         }
2440 
2441         let expected_num_msrs = msr_entries.len();
2442         let num_msrs = self.get_msrs(&mut msr_entries)?;
2443         let msrs = if num_msrs != expected_num_msrs {
2444             let mut faulty_msr_index = num_msrs;
2445             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
2446 
2447             loop {
2448                 warn!(
2449                     "Detected faulty MSR 0x{:x} while getting MSRs",
2450                     msr_entries[faulty_msr_index].index
2451                 );
2452 
2453                 // Skip the first bad MSR
2454                 let start_pos = faulty_msr_index + 1;
2455 
2456                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
2457                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
2458 
2459                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
2460 
2461                 if num_msrs == sub_msr_entries.len() {
2462                     break;
2463                 }
2464 
2465                 faulty_msr_index = start_pos + num_msrs;
2466             }
2467 
2468             msr_entries_tmp
2469         } else {
2470             msr_entries
2471         };
2472 
2473         let vcpu_events = self.get_vcpu_events()?;
2474         let tsc_khz = self.tsc_khz()?;
2475 
2476         Ok(VcpuKvmState {
2477             cpuid,
2478             msrs,
2479             vcpu_events,
2480             regs: regs.into(),
2481             sregs: sregs.into(),
2482             fpu,
2483             lapic_state,
2484             xsave,
2485             xcrs,
2486             mp_state,
2487             tsc_khz,
2488         }
2489         .into())
2490     }
2491 
2492     ///
2493     /// Get the current AArch64 CPU state
2494     ///
2495     #[cfg(target_arch = "aarch64")]
2496     fn state(&self) -> cpu::Result<CpuState> {
2497         let mut state = VcpuKvmState {
2498             mp_state: self.get_mp_state()?.into(),
2499             ..Default::default()
2500         };
2501         // Get core registers
2502         state.core_regs = self.get_regs()?.into();
2503 
2504         // Get system registers.
2505         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2506         // For Armv8 there are around 500 registers.
2507         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2508         let mut reg_list = kvm_bindings::RegList::new(500).unwrap();
2509         self.fd
2510             .lock()
2511             .unwrap()
2512             .get_reg_list(&mut reg_list)
2513             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2514 
2515         // At this point reg_list should contain: core registers and system
2516         // registers.
2517         // The register list contains the number of registers and their ids. We
2518         // will need to call KVM_GET_ONE_REG on each id in order to save
2519         // all of them. We carve out from the list the core registers, which are
2520         // represented in the kernel by kvm_regs structure and for which we can
2521         // calculate the id based on the offset in the structure.
2522         reg_list.retain(|regid| is_system_register(*regid));
2523 
2524         // Now, for the rest of the registers left in the previously fetched
2525         // register list, we are simply calling KVM_GET_ONE_REG.
2526         let indices = reg_list.as_slice();
2527         for index in indices.iter() {
2528             let mut bytes = [0_u8; 8];
2529             self.fd
2530                 .lock()
2531                 .unwrap()
2532                 .get_one_reg(*index, &mut bytes)
2533                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2534             sys_regs.push(kvm_bindings::kvm_one_reg {
2535                 id: *index,
2536                 addr: u64::from_le_bytes(bytes),
2537             });
2538         }
2539 
2540         state.sys_regs = sys_regs;
2541 
2542         Ok(state.into())
2543     }
2544 
2545     #[cfg(target_arch = "riscv64")]
2546     ///
2547     /// Get the current RISC-V 64-bit CPU state
2548     ///
2549     fn state(&self) -> cpu::Result<CpuState> {
2550         let mut state = VcpuKvmState {
2551             mp_state: self.get_mp_state()?.into(),
2552             ..Default::default()
2553         };
2554         // Get core registers
2555         state.core_regs = self.get_regs()?.into();
2556 
2557         // Get non-core registers.
2558         // Call KVM_GET_REG_LIST to get all registers available to the guest.
2559         // For RISC-V 64-bit there are around 200 registers.
2560         let mut sys_regs: Vec<kvm_bindings::kvm_one_reg> = Vec::new();
2561         let mut reg_list = kvm_bindings::RegList::new(200).unwrap();
2562         self.fd
2563             .lock()
2564             .unwrap()
2565             .get_reg_list(&mut reg_list)
2566             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
2567 
2568         // At this point reg_list should contain:
2569         // - core registers
2570         // - config registers
2571         // - timer registers
2572         // - control and status registers
2573         // - AIA control and status registers
2574         // - smstateen control and status registers
2575         // - sbi_sta control and status registers.
2576         //
2577         // The register list contains the number of registers and their ids. We
2578         // will need to call KVM_GET_ONE_REG on each id in order to save
2579         // all of them. We carve out from the list the core registers which are
2580         // represented in the kernel by `kvm_riscv_core` structure and for which
2581         // we can calculate the id based on the offset in the structure.
2582         reg_list.retain(|regid| is_non_core_register(*regid));
2583 
2584         // Now, for the rest of the registers left in the previously fetched
2585         // register list, we are simply calling KVM_GET_ONE_REG.
2586         let indices = reg_list.as_slice();
2587         for index in indices.iter() {
2588             let mut bytes = [0_u8; 8];
2589             self.fd
2590                 .lock()
2591                 .unwrap()
2592                 .get_one_reg(*index, &mut bytes)
2593                 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?;
2594             sys_regs.push(kvm_bindings::kvm_one_reg {
2595                 id: *index,
2596                 addr: u64::from_le_bytes(bytes),
2597             });
2598         }
2599 
2600         state.non_core_regs = sys_regs;
2601 
2602         Ok(state.into())
2603     }
2604 
2605     #[cfg(target_arch = "x86_64")]
2606     ///
2607     /// Restore the previously saved CPU state
2608     ///
2609     /// Ordering requirements:
2610     ///
2611     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
2612     /// still running.
2613     ///
2614     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
2615     /// if we ever change the BSP, we have to do that before restoring anything.
2616     /// The same seems to be true for CPUID stuff.
2617     ///
2618     /// SREGS saves/restores a pending interrupt, similar to what
2619     /// VCPU_EVENTS also does.
2620     ///
2621     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
2622     /// done before SET_VCPU_EVENTS, which restores it.
2623     ///
2624     /// SET_LAPIC must come after SET_SREGS, because the latter restores
2625     /// the apic base msr.
2626     ///
2627     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
2628     /// only restores successfully, when the LAPIC is correctly configured.
2629     ///
2630     /// Arguments: CpuState
2631     /// # Example
2632     ///
2633     /// ```rust
2634     /// # use hypervisor::kvm::KvmHypervisor;
2635     /// # use std::sync::Arc;
2636     /// let kvm = KvmHypervisor::new().unwrap();
2637     /// let hv = Arc::new(kvm);
2638     /// let vm = hv.create_vm().expect("new VM fd creation failed");
2639     /// vm.enable_split_irq().unwrap();
2640     /// let vcpu = vm.create_vcpu(0, None).unwrap();
2641     /// let state = vcpu.state().unwrap();
2642     /// vcpu.set_state(&state).unwrap();
2643     /// ```
2644     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2645         let state: VcpuKvmState = state.clone().into();
2646         self.set_cpuid2(&state.cpuid)?;
2647         self.set_mp_state(state.mp_state.into())?;
2648         self.set_regs(&state.regs.into())?;
2649         self.set_sregs(&state.sregs.into())?;
2650         self.set_xsave(&state.xsave)?;
2651         self.set_xcrs(&state.xcrs)?;
2652         self.set_lapic(&state.lapic_state)?;
2653         self.set_fpu(&state.fpu)?;
2654 
2655         if let Some(freq) = state.tsc_khz {
2656             self.set_tsc_khz(freq)?;
2657         }
2658 
2659         // Try to set all MSRs previously stored.
2660         // If the number of MSRs set from SET_MSRS is different from the
2661         // expected amount, we fall back to a slower method by setting MSRs
2662         // by chunks. This is the only way to make sure we try to set as many
2663         // MSRs as possible, even if some MSRs are not supported.
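             // (Same chunked-retry scheme as the MSR save path in `state()`:
             // skip the first faulty index and retry with the remaining
             // entries.)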
2664         let expected_num_msrs = state.msrs.len();
2665         let num_msrs = self.set_msrs(&state.msrs)?;
2666         if num_msrs != expected_num_msrs {
2667             let mut faulty_msr_index = num_msrs;
2668 
2669             loop {
2670                 warn!(
2671                     "Detected faulty MSR 0x{:x} while setting MSRs",
2672                     state.msrs[faulty_msr_index].index
2673                 );
2674 
2675                 // Skip the first bad MSR
2676                 let start_pos = faulty_msr_index + 1;
2677 
2678                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2679 
2680                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2681 
2682                 if num_msrs == sub_msr_entries.len() {
2683                     break;
2684                 }
2685 
2686                 faulty_msr_index = start_pos + num_msrs;
2687             }
2688         }
2689 
2690         self.set_vcpu_events(&state.vcpu_events)?;
2691 
2692         Ok(())
2693     }
2694 
2695     ///
2696     /// Restore the previously saved AArch64 CPU state
2697     ///
2698     #[cfg(target_arch = "aarch64")]
2699     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2700         let state: VcpuKvmState = state.clone().into();
2701         // Set core registers
2702         self.set_regs(&state.core_regs.into())?;
2703         // Set system registers
2704         for reg in &state.sys_regs {
2705             self.fd
2706                 .lock()
2707                 .unwrap()
2708                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2709                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2710         }
2711 
2712         self.set_mp_state(state.mp_state.into())?;
2713 
2714         Ok(())
2715     }
2716 
2717     #[cfg(target_arch = "riscv64")]
2718     ///
2719     /// Restore the previously saved RISC-V 64-bit CPU state
2720     ///
2721     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2722         let state: VcpuKvmState = state.clone().into();
2723         // Set core registers
2724         self.set_regs(&state.core_regs.into())?;
2725         // Set non-core registers
2726         for reg in &state.non_core_regs {
2727             self.fd
2728                 .lock()
2729                 .unwrap()
2730                 .set_one_reg(reg.id, &reg.addr.to_le_bytes())
2731                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2732         }
2733 
2734         self.set_mp_state(state.mp_state.into())?;
2735 
2736         Ok(())
2737     }
2738 
2739     ///
2740     /// Initialize TDX for this CPU
2741     ///
2742     #[cfg(feature = "tdx")]
2743     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2744         tdx_command(
2745             &self.fd.lock().unwrap().as_raw_fd(),
2746             TdxCommand::InitVcpu,
2747             0,
2748             hob_address,
2749         )
2750         .map_err(cpu::HypervisorCpuError::InitializeTdx)
2751     }
2752 
2753     ///
2754     /// Set the "immediate_exit" state
2755     ///
2756     fn set_immediate_exit(&self, exit: bool) {
2757         self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into());
2758     }
2759 
2760     ///
2761     /// Returns the details about the TDX exit reason.
2762     ///
2763     #[cfg(feature = "tdx")]
2764     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2765         let mut fd = self.fd.as_ref().lock().unwrap();
2766         let kvm_run = fd.get_kvm_run();
2767         // SAFETY: accessing a union field in a valid structure
2768         let tdx_vmcall = unsafe {
2769             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2770                 as *mut KvmTdxExit))
2771                 .u
2772                 .vmcall
2773         };
2774 
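             // Default to "invalid operand" so an unrecognized vmcall is reported back as a
             // failure; recognized subfunctions get their real status via set_tdx_status().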
2775         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2776 
2777         if tdx_vmcall.type_ != 0 {
2778             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2779         }
2780 
2781         match tdx_vmcall.subfunction {
2782             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2783             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2784                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2785             }
2786             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2787         }
2788     }
2789 
2790     ///
2791     /// Set the status code for TDX exit
2792     ///
2793     #[cfg(feature = "tdx")]
2794     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2795         let mut fd = self.fd.as_ref().lock().unwrap();
2796         let kvm_run = fd.get_kvm_run();
2797         // SAFETY: accessing a union field in a valid structure
2798         let tdx_vmcall = unsafe {
2799             &mut (*((&mut kvm_run.__bindgen_anon_1) as *mut kvm_run__bindgen_ty_1
2800                 as *mut KvmTdxExit))
2801                 .u
2802                 .vmcall
2803         };
2804 
2805         tdx_vmcall.status_code = match status {
2806             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2807             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2808         };
2809     }
2810 
2811     #[cfg(target_arch = "x86_64")]
2812     ///
2813     /// Return the list of initial MSR entries for a VCPU
2814     ///
2815     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2816         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2817 
2818         [
2819             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2820             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2821             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2822             msr!(msr_index::MSR_STAR),
2823             msr!(msr_index::MSR_CSTAR),
2824             msr!(msr_index::MSR_LSTAR),
2825             msr!(msr_index::MSR_KERNEL_GS_BASE),
2826             msr!(msr_index::MSR_SYSCALL_MASK),
2827             msr!(msr_index::MSR_IA32_TSC),
2828             msr_data!(
2829                 msr_index::MSR_IA32_MISC_ENABLE,
2830                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2831             ),
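                 // Enable MTRRs with write-back as the default memory type.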
2832             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2833         ]
2834         .to_vec()
2835     }
2836 
2837     #[cfg(target_arch = "aarch64")]
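         ///
         /// Check whether KVM exposes the PMUv3 attribute group for this vCPU.
         ///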
2838     fn has_pmu_support(&self) -> bool {
2839         let cpu_attr = kvm_bindings::kvm_device_attr {
2840             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2841             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2842             addr: 0x0,
2843             flags: 0,
2844         };
2845         self.fd.lock().unwrap().has_device_attr(&cpu_attr).is_ok()
2846     }
2847 
2848     #[cfg(target_arch = "aarch64")]
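         ///
         /// Initialize the PMUv3 for this vCPU with the given overflow interrupt.
         ///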
2849     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2850         let cpu_attr = kvm_bindings::kvm_device_attr {
2851             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2852             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2853             addr: 0x0,
2854             flags: 0,
2855         };
2856         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2857             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2858             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2859             addr: &irq as *const u32 as u64,
2860             flags: 0,
2861         };
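             // KVM requires the PMU overflow interrupt to be set before the PMU is initialized.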
2862         self.fd
2863             .lock()
2864             .unwrap()
2865             .set_device_attr(&cpu_attr_irq)
2866             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2867         self.fd
2868             .lock()
2869             .unwrap()
2870             .set_device_attr(&cpu_attr)
2871             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2872     }
2873 
2874     #[cfg(target_arch = "x86_64")]
2875     ///
2876     /// Get the frequency of the TSC if available
2877     ///
2878     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2879         match self.fd.lock().unwrap().get_tsc_khz() {
2880             Err(e) => {
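                     // Treat EIO as "TSC frequency not available" rather than a hard error.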
2881                 if e.errno() == libc::EIO {
2882                     Ok(None)
2883                 } else {
2884                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2885                 }
2886             }
2887             Ok(v) => Ok(Some(v)),
2888         }
2889     }
2890 
2891     #[cfg(target_arch = "x86_64")]
2892     ///
2893     /// Set the frequency of the TSC if available
2894     ///
2895     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2896         match self.fd.lock().unwrap().set_tsc_khz(freq) {
2897             Err(e) => {
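                     // Tolerate EIO: the host cannot scale the TSC, which is not fatal here.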
2898                 if e.errno() == libc::EIO {
2899                     Ok(())
2900                 } else {
2901                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2902                 }
2903             }
2904             Ok(_) => Ok(()),
2905         }
2906     }
2907 
2908     #[cfg(target_arch = "x86_64")]
2909     ///
2910     /// Trigger NMI interrupt
2911     ///
2912     fn nmi(&self) -> cpu::Result<()> {
2913         match self.fd.lock().unwrap().nmi() {
2914             Err(e) => {
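                     // Tolerate EIO and report success; the NMI is simply not injected.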
2915                 if e.errno() == libc::EIO {
2916                     Ok(())
2917                 } else {
2918                     Err(cpu::HypervisorCpuError::Nmi(e.into()))
2919                 }
2920             }
2921             Ok(_) => Ok(()),
2922         }
2923     }
2924 }
2925 
2926 impl KvmVcpu {
2927     #[cfg(target_arch = "x86_64")]
2928     ///
2929     /// X86 specific call that returns the vcpu's current "xsave struct".
2930     ///
2931     fn get_xsave(&self) -> cpu::Result<XsaveState> {
2932         Ok(self
2933             .fd
2934             .lock()
2935             .unwrap()
2936             .get_xsave()
2937             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))?
2938             .into())
2939     }
2940 
2941     #[cfg(target_arch = "x86_64")]
2942     ///
2943     /// X86 specific call that sets the vcpu's current "xsave struct".
2944     ///
2945     fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> {
2946         let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into();
2947         // SAFETY: Here we trust the kernel not to read past the end of the kvm_xsave struct
2948         // when calling the kvm-ioctl library function.
2949         unsafe {
2950             self.fd
2951                 .lock()
2952                 .unwrap()
2953                 .set_xsave(&xsave)
2954                 .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2955         }
2956     }
2957 
2958     #[cfg(target_arch = "x86_64")]
2959     ///
2960     /// X86 specific call that returns the vcpu's current "xcrs".
2961     ///
2962     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2963         self.fd
2964             .lock()
2965             .unwrap()
2966             .get_xcrs()
2967             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2968     }
2969 
2970     #[cfg(target_arch = "x86_64")]
2971     ///
2972     /// X86 specific call that sets the vcpu's current "xcrs".
2973     ///
2974     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2975         self.fd
2976             .lock()
2977             .unwrap()
2978             .set_xcrs(xcrs)
2979             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2980     }
2981 
2982     #[cfg(target_arch = "x86_64")]
2983     ///
2984     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2985     /// states of the vcpu.
2986     ///
2987     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2988         self.fd
2989             .lock()
2990             .unwrap()
2991             .get_vcpu_events()
2992             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2993     }
2994 
2995     #[cfg(target_arch = "x86_64")]
2996     ///
2997     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2998     /// of the vcpu.
2999     ///
3000     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
3001         self.fd
3002             .lock()
3003             .unwrap()
3004             .set_vcpu_events(events)
3005             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
3006     }
3007 }
3008 
3009 #[cfg(test)]
3010 mod tests {
3011     #[test]
3012     #[cfg(target_arch = "riscv64")]
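         // Write a distinct value into every RISC-V core register, then read the state
         // back and check it round-trips unchanged.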
3013     fn test_get_and_set_regs() {
3014         use super::*;
3015 
3016         let kvm = KvmHypervisor::new().unwrap();
3017         let hypervisor = Arc::new(kvm);
3018         let vm = hypervisor.create_vm().expect("new VM fd creation failed");
3019         let vcpu0 = vm.create_vcpu(0, None).unwrap();
3020 
3021         let core_regs = StandardRegisters::from(kvm_riscv_core {
3022             regs: user_regs_struct {
3023                 pc: 0x00,
3024                 ra: 0x01,
3025                 sp: 0x02,
3026                 gp: 0x03,
3027                 tp: 0x04,
3028                 t0: 0x05,
3029                 t1: 0x06,
3030                 t2: 0x07,
3031                 s0: 0x08,
3032                 s1: 0x09,
3033                 a0: 0x0a,
3034                 a1: 0x0b,
3035                 a2: 0x0c,
3036                 a3: 0x0d,
3037                 a4: 0x0e,
3038                 a5: 0x0f,
3039                 a6: 0x10,
3040                 a7: 0x11,
3041                 s2: 0x12,
3042                 s3: 0x13,
3043                 s4: 0x14,
3044                 s5: 0x15,
3045                 s6: 0x16,
3046                 s7: 0x17,
3047                 s8: 0x18,
3048                 s9: 0x19,
3049                 s10: 0x1a,
3050                 s11: 0x1b,
3051                 t3: 0x1c,
3052                 t4: 0x1d,
3053                 t5: 0x1e,
3054                 t6: 0x1f,
3055             },
3056             mode: 0x00,
3057         });
3058 
3059         vcpu0.set_regs(&core_regs).unwrap();
3060         assert_eq!(vcpu0.get_regs().unwrap(), core_regs);
3061     }
3062 }
3063