// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}
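
// Minimal sanity-check sketch for the flag conversions above; the region
// values are arbitrary test data, not taken from a real guest layout.
#[cfg(test)]
mod user_memory_region_flag_tests {
    use super::*;

    #[test]
    fn readonly_and_dirty_log_flags_round_trip() {
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x1000,
            memory_size: 0x2000,
            userspace_addr: 0x8000,
            flags: KVM_MEM_LOG_DIRTY_PAGES,
        };
        let region: UserMemoryRegion = kvm_region.into();
        // KVM_MEM_READONLY was not set, so the region must be writable,
        // and the dirty-log flag must carry over.
        assert!(region.flags & USER_MEMORY_REGION_WRITE != 0);
        assert!(region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0);
        // Converting back must reproduce the original KVM flags.
        let back: kvm_userspace_memory_region = region.into();
        assert_eq!(back.flags, KVM_MEM_LOG_DIRTY_PAGES);
    }
}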
not valid"), 230 } 231 } 232 } 233 234 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress { 235 fn from(a: kvm_ioctls::IoEventAddress) -> Self { 236 match a { 237 kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x), 238 kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x), 239 } 240 } 241 } 242 243 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress { 244 fn from(a: IoEventAddress) -> Self { 245 match a { 246 IoEventAddress::Pio(x) => Self::Pio(x), 247 IoEventAddress::Mmio(x) => Self::Mmio(x), 248 } 249 } 250 } 251 252 impl From<VcpuKvmState> for CpuState { 253 fn from(s: VcpuKvmState) -> Self { 254 CpuState::Kvm(s) 255 } 256 } 257 258 impl From<CpuState> for VcpuKvmState { 259 fn from(s: CpuState) -> Self { 260 match s { 261 CpuState::Kvm(s) => s, 262 /* Needed in case other hypervisors are enabled */ 263 #[allow(unreachable_patterns)] 264 _ => panic!("CpuState is not valid"), 265 } 266 } 267 } 268 269 #[cfg(target_arch = "x86_64")] 270 impl From<kvm_clock_data> for ClockData { 271 fn from(d: kvm_clock_data) -> Self { 272 ClockData::Kvm(d) 273 } 274 } 275 276 #[cfg(target_arch = "x86_64")] 277 impl From<ClockData> for kvm_clock_data { 278 fn from(ms: ClockData) -> Self { 279 match ms { 280 ClockData::Kvm(s) => s, 281 /* Needed in case other hypervisors are enabled */ 282 #[allow(unreachable_patterns)] 283 _ => panic!("CpuState is not valid"), 284 } 285 } 286 } 287 288 impl From<kvm_irq_routing_entry> for IrqRoutingEntry { 289 fn from(s: kvm_irq_routing_entry) -> Self { 290 IrqRoutingEntry::Kvm(s) 291 } 292 } 293 294 impl From<IrqRoutingEntry> for kvm_irq_routing_entry { 295 fn from(e: IrqRoutingEntry) -> Self { 296 match e { 297 IrqRoutingEntry::Kvm(e) => e, 298 /* Needed in case other hypervisors are enabled */ 299 #[allow(unreachable_patterns)] 300 _ => panic!("IrqRoutingEntry is not valid"), 301 } 302 } 303 } 304 305 struct KvmDirtyLogSlot { 306 slot: u32, 307 guest_phys_addr: u64, 308 memory_size: u64, 309 userspace_addr: u64, 310 } 311 312 /// Wrapper over KVM VM ioctls. 313 pub struct KvmVm { 314 fd: Arc<VmFd>, 315 #[cfg(target_arch = "x86_64")] 316 msrs: Vec<MsrEntry>, 317 dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>, 318 } 319 320 impl KvmVm { 321 /// 322 /// Creates an emulated device in the kernel. 323 /// 324 /// See the documentation for `KVM_CREATE_DEVICE`. 325 fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> { 326 let device_fd = self 327 .fd 328 .create_device(device) 329 .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?; 330 Ok(VfioDeviceFd::new_from_kvm(device_fd)) 331 } 332 /// Checks if a particular `Cap` is available. 333 fn check_extension(&self, c: Cap) -> bool { 334 self.fd.check_extension(c) 335 } 336 } 337 338 /// 339 /// Implementation of Vm trait for KVM 340 /// Example: 341 /// #[cfg(feature = "kvm")] 342 /// extern crate hypervisor 343 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 344 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 345 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 346 /// vm.set/get().unwrap() 347 /// 348 impl vm::Vm for KvmVm { 349 #[cfg(target_arch = "x86_64")] 350 /// 351 /// Sets the address of the one-page region in the VM's address space. 
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 the range of the 'devid' is limited: it must
                    // fit in 16 bits, i.e. it cannot exceed 65535 (the
                    // maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data. This resolves the range
                    // problem and gives a distinct `devid` to every device,
                    // with the limitation that at most 256 segments can be
                    // supported. (A worked example follows this function.)
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }
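
    // Worked example of the `devid` packing above (values illustrative, not
    // from a real platform): segment 0x0012, bus 0x00, device 0x03,
    // function 0x1 give a BDF of 0x0012_0019; keeping the low byte of the
    // segment and shifting it into the bus field yields
    // (0x0012_0019 & 0x00ff_0000) >> 8 | 0x19 = 0x1219.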

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in the
        // kernel; the PICs and the IOAPIC are not.
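        // For KVM_CAP_SPLIT_IRQCHIP, cap.args[0] carries the number of
        // routes reserved for the userspace IOAPIC; NUM_IOAPIC_PINS matches
        // the IOAPIC's redirection entries.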
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64; `KVM_GET_ONE_REG`
    /// is used to get the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-A
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other registers" section of the Armv8-A
        // architecture. First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Exception Link Register for EL1. When taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in
        // the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64; `KVM_SET_ONE_REG`
    /// is used to set the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function writes the registers in the exact order `get_regs`
        // reads them. Look there for additional info on each register.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.sp.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pc.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pstate.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.sp_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.elr_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.spsr[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to set up the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the Hyper-V SynIC.
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated, as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
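    ///
    /// The entries must be pre-populated with the MSR indices to read. A
    /// minimal sketch (illustrative only, not a doctest; 0x174 is
    /// IA32_SYSENTER_CS):
    ///
    /// ```ignore
    /// let mut msrs = vec![MsrEntry { index: 0x174, ..Default::default() }];
    /// let read = vcpu.get_msrs(&mut msrs).unwrap();
    /// assert_eq!(read, 1); // the value is now in msrs[0].data
    /// ```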
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set up the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU, returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or the guest doesn't
            // use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
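    ///
    /// On x86_64 this programs DR0..DR3 with the given addresses and builds
    /// DR7 accordingly; e.g. (worked from the code below) two breakpoints
    /// yield `debugreg[7] = 0x0600 | 0b10 | 0b1000 = 0x060a`, i.e. GE plus
    /// the G0 and G1 global-enable bits.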
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        let mut dbg = kvm_guest_debug {
            #[cfg(target_arch = "x86_64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            #[cfg(target_arch = "aarch64")]
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set the debug registers.
        // Here we assume that the number of addresses does not exceed what
        // `Hypervisor::get_guest_debug_hw_bps()` specifies.
        #[cfg(target_arch = "x86_64")]
        {
            // Set bits 9 and 10.
            // bit 9: GE (global exact breakpoint enable) flag.
            // bit 10: always 1.
            dbg.arch.debugreg[7] = 0x0600;

            for (i, addr) in addrs.iter().enumerate() {
                dbg.arch.debugreg[i] = addr.0;
                // Set the global breakpoint enable flag for this breakpoint.
                dbg.arch.debugreg[7] |= 2 << (i * 2);
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            for (i, addr) in addrs.iter().enumerate() {
                // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
                // bit 0: 1 (Enabled)
                // bit 1~2: 0b11 (PMC = EL1/EL0)
                // bit 5~8: 0b1111 (BAS = AArch64)
                // others: 0
                dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
                // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
                // bit 2~52: VA[2:52]
                dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
            }
        }
        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Gets the value of a system register
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
        //
        // The Arm Architecture Reference Manual defines the encoding of
        // AArch64 system registers; see
        // https://developer.arm.com/documentation/ddi0487 (chapter D12).
        // KVM defines its own ID for each AArch64 system register, which is
        // used when calling `KVM_GET/SET_ONE_REG` to access a system
        // register of a guest. A mapping exists between the Arm standard
        // encoding and the KVM ID.
        // This function takes the standard u32 ID as input parameter, converts
        // it to the corresponding KVM ID, and calls the `KVM_GET_ONE_REG` API
        // to get the value of the system register.
        //
        let id: u64 = KVM_REG_ARM64
            | KVM_REG_SIZE_U64
            | KVM_REG_ARM64_SYSREG as u64
            | ((((sys_reg) >> 5)
                & (KVM_REG_ARM64_SYSREG_OP0_MASK
                    | KVM_REG_ARM64_SYSREG_OP1_MASK
                    | KVM_REG_ARM64_SYSREG_CRN_MASK
                    | KVM_REG_ARM64_SYSREG_CRM_MASK
                    | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
        Ok(self
            .fd
            .get_one_reg(id)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
            .try_into()
            .unwrap())
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register offset of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
                PSTATE_FAULT_BITS_64.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into())
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
                    fdt_start.into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before almost everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// if it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            // E.g. if 100 MSRs were expected and GET_MSRS returned 57, the
            // MSR at index 57 is the faulty one; skip it and retry from 58.
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            ..Default::default()
        };
        // Get the core registers.
        state.core_regs = self.get_regs()?;

        // Get the system registers.
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            ..Default::default()
        };
        // Get core registers
        state.core_regs = self.get_regs()?;

        // Get system registers
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For ARMv8 there are around 500 registers.
        let mut sys_regs: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain: core registers and system
        // registers.
        // The register list contains the number of registers and their ids. We
        // need to call KVM_GET_ONE_REG on each id in order to save all of
        // them. We carve out from the list the core registers, which are
        // represented in the kernel by the kvm_regs structure and for which we
        // can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            sys_regs.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
                    .try_into()
                    .unwrap(),
            });
        }

        state.sys_regs = sys_regs;

        Ok(state.into())
    }
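    // A minimal save/restore sketch for the AArch64 path, mirroring the
    // x86_64 doctests in this file (the split-irqchip setup shown in those
    // doctests is x86_64-specific and is omitted here):
    //
    //     let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    //     let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    //     let vm = hv.create_vm().unwrap();
    //     let vcpu = vm.create_vcpu(0, None).unwrap();
    //     let state = vcpu.state().unwrap();
    //     vcpu.set_state(&state).unwrap();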
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set from SET_MSRS is different from the
        // expected amount, we fall back to a slower method by setting MSRs
        // by chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        // Set core registers
        self.set_regs(&state.core_regs)?;
        // Set system registers
        for reg in &state.sys_regs {
            self.fd
                .set_one_reg(reg.id, reg.addr.into())
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }

        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        // SAFETY: accessing a union field in a valid structure
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
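    // Sketch of how a VMM run loop might consume the two TDX helpers above
    // after a KVM_EXIT_TDX exit, assuming a hypothetical handle_get_quote()
    // handler that is not part of this crate:
    //
    //     match vcpu.get_tdx_exit_details() {
    //         Ok(TdxExitDetails::GetQuote) => {
    //             // handle_get_quote(&mut vcpu);
    //             vcpu.set_tdx_status(TdxExitStatus::Success);
    //         }
    //         Ok(TdxExitDetails::SetupEventNotifyInterrupt) => {
    //             vcpu.set_tdx_status(TdxExitStatus::Success);
    //         }
    //         // Unknown vmcalls keep the INVALID_OPERAND status that
    //         // get_tdx_exit_details() sets up front.
    //         Err(_) => (),
    //     }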
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        // The PMU overflow interrupt must be configured before the PMU itself
        // is initialized.
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as
    /// related states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
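    // get_vcpu_events() above and set_vcpu_events() below wrap the
    // KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS ioctl pair that state() and
    // set_state() use to snapshot pending events. A minimal round trip,
    // assuming the vCPU is paused (see the ordering notes on set_state()):
    //
    //     let events = vcpu.get_vcpu_events()?;
    //     vcpu.set_vcpu_events(&events)?;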
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}