1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause 4 // 5 // Copyright © 2020, Microsoft Corporation 6 // 7 // Copyright 2018-2019 CrowdStrike, Inc. 8 // 9 // 10 11 #[cfg(target_arch = "aarch64")] 12 use crate::aarch64::gic::KvmGicV3Its; 13 #[cfg(target_arch = "aarch64")] 14 pub use crate::aarch64::{ 15 check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit, 16 VcpuKvmState, 17 }; 18 #[cfg(target_arch = "aarch64")] 19 use crate::arch::aarch64::gic::{Vgic, VgicConfig}; 20 use crate::cpu; 21 use crate::hypervisor; 22 use crate::vec_with_array_field; 23 use crate::vm::{self, InterruptSourceConfig, VmOps}; 24 use crate::HypervisorType; 25 #[cfg(target_arch = "aarch64")] 26 use crate::{arm64_core_reg_id, offset_of}; 27 use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd}; 28 use std::any::Any; 29 use std::collections::HashMap; 30 #[cfg(target_arch = "aarch64")] 31 use std::convert::TryInto; 32 #[cfg(target_arch = "x86_64")] 33 use std::fs::File; 34 #[cfg(target_arch = "x86_64")] 35 use std::os::unix::io::AsRawFd; 36 #[cfg(feature = "tdx")] 37 use std::os::unix::io::RawFd; 38 use std::result; 39 #[cfg(target_arch = "x86_64")] 40 use std::sync::atomic::{AtomicBool, Ordering}; 41 #[cfg(target_arch = "aarch64")] 42 use std::sync::Mutex; 43 use std::sync::{Arc, RwLock}; 44 use vmm_sys_util::eventfd::EventFd; 45 // x86_64 dependencies 46 #[cfg(target_arch = "x86_64")] 47 pub mod x86_64; 48 #[cfg(target_arch = "x86_64")] 49 use crate::arch::x86::{ 50 CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters, 51 NUM_IOAPIC_PINS, 52 }; 53 #[cfg(target_arch = "x86_64")] 54 use crate::ClockData; 55 use crate::{ 56 CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion, 57 USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, 58 }; 59 #[cfg(target_arch = "aarch64")] 60 use aarch64::{RegList, Register, StandardRegisters}; 61 #[cfg(target_arch = "x86_64")] 62 use kvm_bindings::{ 63 kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP, 64 KVM_GUESTDBG_USE_HW_BP, 65 }; 66 #[cfg(target_arch = "x86_64")] 67 use x86_64::check_required_kvm_extensions; 68 #[cfg(target_arch = "x86_64")] 69 pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave}; 70 // aarch64 dependencies 71 #[cfg(target_arch = "aarch64")] 72 pub mod aarch64; 73 pub use kvm_bindings; 74 #[cfg(feature = "tdx")] 75 use kvm_bindings::KVMIO; 76 pub use kvm_bindings::{ 77 kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, 78 kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, 79 KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, 80 KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, 81 }; 82 #[cfg(target_arch = "aarch64")] 83 use kvm_bindings::{ 84 kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64, 85 KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, 86 KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, 87 KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, 88 }; 89 pub use kvm_ioctls; 90 pub use kvm_ioctls::{Cap, Kvm}; 91 #[cfg(target_arch = "aarch64")] 92 use std::mem; 93 use thiserror::Error; 94 use vfio_ioctls::VfioDeviceFd; 95 #[cfg(feature = "tdx")] 96 use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr}; 97 /// 98 /// Export generically-named wrappers of kvm-bindings for Unix-based platforms 99 /// 100 pub use { 101 kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr, 102 kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit, 103 }; 104 105 #[cfg(target_arch = "x86_64")] 106 const KVM_CAP_SGX_ATTRIBUTE: u32 = 196; 107 108 #[cfg(feature = "tdx")] 109 const KVM_EXIT_TDX: u32 = 50; 110 #[cfg(feature = "tdx")] 111 const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002; 112 #[cfg(feature = "tdx")] 113 const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004; 114 #[cfg(feature = "tdx")] 115 const TDG_VP_VMCALL_SUCCESS: u64 = 0; 116 #[cfg(feature = "tdx")] 117 const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000; 118 119 #[cfg(feature = "tdx")] 120 ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong); 121 122 #[cfg(feature = "tdx")] 123 #[repr(u32)] 124 enum TdxCommand { 125 Capabilities = 0, 126 InitVm, 127 InitVcpu, 128 InitMemRegion, 129 Finalize, 130 } 131 132 #[cfg(feature = "tdx")] 133 pub enum TdxExitDetails { 134 GetQuote, 135 SetupEventNotifyInterrupt, 136 } 137 138 #[cfg(feature = "tdx")] 139 pub enum TdxExitStatus { 140 Success, 141 InvalidOperand, 142 } 143 144 #[cfg(feature = "tdx")] 145 const TDX_MAX_NR_CPUID_CONFIGS: usize = 6; 146 147 #[cfg(feature = "tdx")] 148 #[repr(C)] 149 #[derive(Debug, Default)] 150 pub struct TdxCpuidConfig { 151 pub leaf: u32, 152 pub sub_leaf: u32, 153 pub eax: u32, 154 pub ebx: u32, 155 pub ecx: u32, 156 pub edx: u32, 157 } 158 159 #[cfg(feature = "tdx")] 160 #[repr(C)] 161 #[derive(Debug, Default)] 162 pub struct TdxCapabilities { 163 pub attrs_fixed0: u64, 164 pub attrs_fixed1: u64, 165 pub xfam_fixed0: u64, 166 pub xfam_fixed1: u64, 167 pub nr_cpuid_configs: u32, 168 pub padding: u32, 169 pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS], 170 } 171 172 impl From<kvm_userspace_memory_region> for UserMemoryRegion { 173 fn from(region: kvm_userspace_memory_region) -> Self { 174 let mut flags = USER_MEMORY_REGION_READ; 175 if region.flags & KVM_MEM_READONLY == 0 { 176 flags |= USER_MEMORY_REGION_WRITE; 177 } 178 if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 { 179 flags |= USER_MEMORY_REGION_LOG_DIRTY; 180 } 181 182 UserMemoryRegion { 183 slot: region.slot, 184 guest_phys_addr: region.guest_phys_addr, 185 memory_size: region.memory_size, 186 userspace_addr: region.userspace_addr, 187 flags, 188 } 189 } 190 } 191 192 impl From<UserMemoryRegion> for kvm_userspace_memory_region { 193 fn from(region: UserMemoryRegion) -> Self { 194 assert!( 195 region.flags & USER_MEMORY_REGION_READ != 0, 196 "KVM mapped memory is always readable" 197 ); 198 199 let mut flags = 0; 200 if region.flags & USER_MEMORY_REGION_WRITE == 0 { 201 flags |= KVM_MEM_READONLY; 202 } 203 if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 { 204 flags |= KVM_MEM_LOG_DIRTY_PAGES; 205 } 206 207 kvm_userspace_memory_region { 208 slot: region.slot, 209 guest_phys_addr: region.guest_phys_addr, 210 memory_size: region.memory_size, 211 userspace_addr: region.userspace_addr, 212 flags, 213 } 214 } 215 } 216 217 impl From<kvm_mp_state> for MpState { 218 fn from(s: kvm_mp_state) -> Self { 219 MpState::Kvm(s) 220 } 221 } 222 223 impl From<MpState> for kvm_mp_state { 224 fn from(ms: MpState) -> Self { 225 match ms { 226 MpState::Kvm(s) => s, 227 /* Needed in case other hypervisors are enabled */ 228 #[allow(unreachable_patterns)] 229 _ => panic!("CpuState is not valid"), 230 } 231 } 232 } 233 234 impl From<kvm_ioctls::IoEventAddress> for IoEventAddress { 235 fn from(a: kvm_ioctls::IoEventAddress) -> Self { 236 match a { 237 kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x), 238 kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x), 239 } 240 } 241 } 242 243 impl From<IoEventAddress> for kvm_ioctls::IoEventAddress { 244 fn from(a: IoEventAddress) -> Self { 245 match a { 246 IoEventAddress::Pio(x) => Self::Pio(x), 247 IoEventAddress::Mmio(x) => Self::Mmio(x), 248 } 249 } 250 } 251 252 impl From<VcpuKvmState> for CpuState { 253 fn from(s: VcpuKvmState) -> Self { 254 CpuState::Kvm(s) 255 } 256 } 257 258 impl From<CpuState> for VcpuKvmState { 259 fn from(s: CpuState) -> Self { 260 match s { 261 CpuState::Kvm(s) => s, 262 /* Needed in case other hypervisors are enabled */ 263 #[allow(unreachable_patterns)] 264 _ => panic!("CpuState is not valid"), 265 } 266 } 267 } 268 269 #[cfg(target_arch = "x86_64")] 270 impl From<kvm_clock_data> for ClockData { 271 fn from(d: kvm_clock_data) -> Self { 272 ClockData::Kvm(d) 273 } 274 } 275 276 #[cfg(target_arch = "x86_64")] 277 impl From<ClockData> for kvm_clock_data { 278 fn from(ms: ClockData) -> Self { 279 match ms { 280 ClockData::Kvm(s) => s, 281 /* Needed in case other hypervisors are enabled */ 282 #[allow(unreachable_patterns)] 283 _ => panic!("CpuState is not valid"), 284 } 285 } 286 } 287 288 impl From<kvm_irq_routing_entry> for IrqRoutingEntry { 289 fn from(s: kvm_irq_routing_entry) -> Self { 290 IrqRoutingEntry::Kvm(s) 291 } 292 } 293 294 impl From<IrqRoutingEntry> for kvm_irq_routing_entry { 295 fn from(e: IrqRoutingEntry) -> Self { 296 match e { 297 IrqRoutingEntry::Kvm(e) => e, 298 /* Needed in case other hypervisors are enabled */ 299 #[allow(unreachable_patterns)] 300 _ => panic!("IrqRoutingEntry is not valid"), 301 } 302 } 303 } 304 305 struct KvmDirtyLogSlot { 306 slot: u32, 307 guest_phys_addr: u64, 308 memory_size: u64, 309 userspace_addr: u64, 310 } 311 312 /// Wrapper over KVM VM ioctls. 313 pub struct KvmVm { 314 fd: Arc<VmFd>, 315 #[cfg(target_arch = "x86_64")] 316 msrs: Vec<MsrEntry>, 317 dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>, 318 } 319 320 impl KvmVm { 321 /// 322 /// Creates an emulated device in the kernel. 323 /// 324 /// See the documentation for `KVM_CREATE_DEVICE`. 325 fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> { 326 let device_fd = self 327 .fd 328 .create_device(device) 329 .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?; 330 Ok(VfioDeviceFd::new_from_kvm(device_fd)) 331 } 332 /// Checks if a particular `Cap` is available. 333 fn check_extension(&self, c: Cap) -> bool { 334 self.fd.check_extension(c) 335 } 336 } 337 338 /// 339 /// Implementation of Vm trait for KVM 340 /// Example: 341 /// #[cfg(feature = "kvm")] 342 /// extern crate hypervisor 343 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 344 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 345 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 346 /// vm.set/get().unwrap() 347 /// 348 impl vm::Vm for KvmVm { 349 #[cfg(target_arch = "x86_64")] 350 /// 351 /// Sets the address of the one-page region in the VM's address space. 352 /// 353 fn set_identity_map_address(&self, address: u64) -> vm::Result<()> { 354 self.fd 355 .set_identity_map_address(address) 356 .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into())) 357 } 358 #[cfg(target_arch = "x86_64")] 359 /// 360 /// Sets the address of the three-page region in the VM's address space. 361 /// 362 fn set_tss_address(&self, offset: usize) -> vm::Result<()> { 363 self.fd 364 .set_tss_address(offset) 365 .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into())) 366 } 367 /// 368 /// Creates an in-kernel interrupt controller. 369 /// 370 fn create_irq_chip(&self) -> vm::Result<()> { 371 self.fd 372 .create_irq_chip() 373 .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into())) 374 } 375 /// 376 /// Registers an event that will, when signaled, trigger the `gsi` IRQ. 377 /// 378 fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 379 self.fd 380 .register_irqfd(fd, gsi) 381 .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into())) 382 } 383 /// 384 /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ. 385 /// 386 fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> { 387 self.fd 388 .unregister_irqfd(fd, gsi) 389 .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into())) 390 } 391 /// 392 /// Creates a VcpuFd object from a vcpu RawFd. 393 /// 394 fn create_vcpu( 395 &self, 396 id: u8, 397 vm_ops: Option<Arc<dyn VmOps>>, 398 ) -> vm::Result<Arc<dyn cpu::Vcpu>> { 399 let vc = self 400 .fd 401 .create_vcpu(id as u64) 402 .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?; 403 let vcpu = KvmVcpu { 404 fd: vc, 405 #[cfg(target_arch = "x86_64")] 406 msrs: self.msrs.clone(), 407 vm_ops, 408 #[cfg(target_arch = "x86_64")] 409 hyperv_synic: AtomicBool::new(false), 410 }; 411 Ok(Arc::new(vcpu)) 412 } 413 #[cfg(target_arch = "aarch64")] 414 /// 415 /// Creates a virtual GIC device. 416 /// 417 fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> { 418 let gic_device = KvmGicV3Its::new(self, config) 419 .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?; 420 Ok(Arc::new(Mutex::new(gic_device))) 421 } 422 /// 423 /// Registers an event to be signaled whenever a certain address is written to. 424 /// 425 fn register_ioevent( 426 &self, 427 fd: &EventFd, 428 addr: &IoEventAddress, 429 datamatch: Option<vm::DataMatch>, 430 ) -> vm::Result<()> { 431 let addr = &kvm_ioctls::IoEventAddress::from(*addr); 432 if let Some(dm) = datamatch { 433 match dm { 434 vm::DataMatch::DataMatch32(kvm_dm32) => self 435 .fd 436 .register_ioevent(fd, addr, kvm_dm32) 437 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 438 vm::DataMatch::DataMatch64(kvm_dm64) => self 439 .fd 440 .register_ioevent(fd, addr, kvm_dm64) 441 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())), 442 } 443 } else { 444 self.fd 445 .register_ioevent(fd, addr, NoDatamatch) 446 .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())) 447 } 448 } 449 /// 450 /// Unregisters an event from a certain address it has been previously registered to. 451 /// 452 fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> { 453 let addr = &kvm_ioctls::IoEventAddress::from(*addr); 454 self.fd 455 .unregister_ioevent(fd, addr, NoDatamatch) 456 .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into())) 457 } 458 459 /// 460 /// Constructs a routing entry 461 /// 462 fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry { 463 match &config { 464 InterruptSourceConfig::MsiIrq(cfg) => { 465 let mut kvm_route = kvm_irq_routing_entry { 466 gsi, 467 type_: KVM_IRQ_ROUTING_MSI, 468 ..Default::default() 469 }; 470 471 kvm_route.u.msi.address_lo = cfg.low_addr; 472 kvm_route.u.msi.address_hi = cfg.high_addr; 473 kvm_route.u.msi.data = cfg.data; 474 475 if self.check_extension(crate::kvm::Cap::MsiDevid) { 476 // On AArch64, there is limitation on the range of the 'devid', 477 // it can not be greater than 65536 (the max of u16). 478 // 479 // BDF can not be used directly, because 'segment' is in high 480 // 16 bits. The layout of the u32 BDF is: 481 // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| 482 // | segment | bus | device | function | 483 // 484 // Now that we support 1 bus only in a segment, we can build a 485 // 'devid' by replacing the 'bus' bits with the low 8 bits of 486 // 'segment' data. 487 // This way we can resolve the range checking problem and give 488 // different `devid` to all the devices. Limitation is that at 489 // most 256 segments can be supported. 490 // 491 let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff; 492 493 kvm_route.flags = KVM_MSI_VALID_DEVID; 494 kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid; 495 } 496 kvm_route.into() 497 } 498 InterruptSourceConfig::LegacyIrq(cfg) => { 499 let mut kvm_route = kvm_irq_routing_entry { 500 gsi, 501 type_: KVM_IRQ_ROUTING_IRQCHIP, 502 ..Default::default() 503 }; 504 kvm_route.u.irqchip.irqchip = cfg.irqchip; 505 kvm_route.u.irqchip.pin = cfg.pin; 506 507 kvm_route.into() 508 } 509 } 510 } 511 512 /// 513 /// Sets the GSI routing table entries, overwriting any previously set 514 /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl. 515 /// 516 fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> { 517 let mut irq_routing = 518 vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len()); 519 irq_routing[0].nr = entries.len() as u32; 520 irq_routing[0].flags = 0; 521 let entries: Vec<kvm_irq_routing_entry> = entries 522 .iter() 523 .map(|entry| match entry { 524 IrqRoutingEntry::Kvm(e) => *e, 525 #[allow(unreachable_patterns)] 526 _ => panic!("IrqRoutingEntry type is wrong"), 527 }) 528 .collect(); 529 530 // SAFETY: irq_routing initialized with entries.len() and now it is being turned into 531 // entries_slice with entries.len() again. It is guaranteed to be large enough to hold 532 // everything from entries. 533 unsafe { 534 let entries_slice: &mut [kvm_irq_routing_entry] = 535 irq_routing[0].entries.as_mut_slice(entries.len()); 536 entries_slice.copy_from_slice(&entries); 537 } 538 539 self.fd 540 .set_gsi_routing(&irq_routing[0]) 541 .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into())) 542 } 543 /// 544 /// Creates a memory region structure that can be used with {create/remove}_user_memory_region 545 /// 546 fn make_user_memory_region( 547 &self, 548 slot: u32, 549 guest_phys_addr: u64, 550 memory_size: u64, 551 userspace_addr: u64, 552 readonly: bool, 553 log_dirty_pages: bool, 554 ) -> UserMemoryRegion { 555 kvm_userspace_memory_region { 556 slot, 557 guest_phys_addr, 558 memory_size, 559 userspace_addr, 560 flags: if readonly { KVM_MEM_READONLY } else { 0 } 561 | if log_dirty_pages { 562 KVM_MEM_LOG_DIRTY_PAGES 563 } else { 564 0 565 }, 566 } 567 .into() 568 } 569 /// 570 /// Creates a guest physical memory region. 571 /// 572 fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { 573 let mut region: kvm_userspace_memory_region = user_memory_region.into(); 574 575 if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 { 576 if (region.flags & KVM_MEM_READONLY) != 0 { 577 return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!( 578 "Error creating regions with both 'dirty-pages-log' and 'read-only'." 579 ))); 580 } 581 582 // Keep track of the regions that need dirty pages log 583 self.dirty_log_slots.write().unwrap().insert( 584 region.slot, 585 KvmDirtyLogSlot { 586 slot: region.slot, 587 guest_phys_addr: region.guest_phys_addr, 588 memory_size: region.memory_size, 589 userspace_addr: region.userspace_addr, 590 }, 591 ); 592 593 // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`. 594 // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`. 595 region.flags = 0; 596 } 597 598 // SAFETY: Safe because guest regions are guaranteed not to overlap. 599 unsafe { 600 self.fd 601 .set_user_memory_region(region) 602 .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into())) 603 } 604 } 605 /// 606 /// Removes a guest physical memory region. 607 /// 608 fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> { 609 let mut region: kvm_userspace_memory_region = user_memory_region.into(); 610 611 // Remove the corresponding entry from "self.dirty_log_slots" if needed 612 self.dirty_log_slots.write().unwrap().remove(®ion.slot); 613 614 // Setting the size to 0 means "remove" 615 region.memory_size = 0; 616 // SAFETY: Safe because guest regions are guaranteed not to overlap. 617 unsafe { 618 self.fd 619 .set_user_memory_region(region) 620 .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into())) 621 } 622 } 623 /// 624 /// Returns the preferred CPU target type which can be emulated by KVM on underlying host. 625 /// 626 #[cfg(target_arch = "aarch64")] 627 fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> { 628 self.fd 629 .get_preferred_target(kvi) 630 .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into())) 631 } 632 #[cfg(target_arch = "x86_64")] 633 fn enable_split_irq(&self) -> vm::Result<()> { 634 // Create split irqchip 635 // Only the local APIC is emulated in kernel, both PICs and IOAPIC 636 // are not. 637 let mut cap = kvm_enable_cap { 638 cap: KVM_CAP_SPLIT_IRQCHIP, 639 ..Default::default() 640 }; 641 cap.args[0] = NUM_IOAPIC_PINS as u64; 642 self.fd 643 .enable_cap(&cap) 644 .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?; 645 Ok(()) 646 } 647 #[cfg(target_arch = "x86_64")] 648 fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> { 649 let mut cap = kvm_enable_cap { 650 cap: KVM_CAP_SGX_ATTRIBUTE, 651 ..Default::default() 652 }; 653 cap.args[0] = file.as_raw_fd() as u64; 654 self.fd 655 .enable_cap(&cap) 656 .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?; 657 Ok(()) 658 } 659 /// Retrieve guest clock. 660 #[cfg(target_arch = "x86_64")] 661 fn get_clock(&self) -> vm::Result<ClockData> { 662 Ok(self 663 .fd 664 .get_clock() 665 .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))? 666 .into()) 667 } 668 /// Set guest clock. 669 #[cfg(target_arch = "x86_64")] 670 fn set_clock(&self, data: &ClockData) -> vm::Result<()> { 671 let data = (*data).into(); 672 self.fd 673 .set_clock(&data) 674 .map_err(|e| vm::HypervisorVmError::SetClock(e.into())) 675 } 676 /// Create a device that is used for passthrough 677 fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> { 678 let mut vfio_dev = kvm_create_device { 679 type_: kvm_device_type_KVM_DEV_TYPE_VFIO, 680 fd: 0, 681 flags: 0, 682 }; 683 684 self.create_device(&mut vfio_dev) 685 .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into())) 686 } 687 /// 688 /// Start logging dirty pages 689 /// 690 fn start_dirty_log(&self) -> vm::Result<()> { 691 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 692 for (_, s) in dirty_log_slots.iter() { 693 let region = kvm_userspace_memory_region { 694 slot: s.slot, 695 guest_phys_addr: s.guest_phys_addr, 696 memory_size: s.memory_size, 697 userspace_addr: s.userspace_addr, 698 flags: KVM_MEM_LOG_DIRTY_PAGES, 699 }; 700 // SAFETY: Safe because guest regions are guaranteed not to overlap. 701 unsafe { 702 self.fd 703 .set_user_memory_region(region) 704 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 705 } 706 } 707 708 Ok(()) 709 } 710 711 /// 712 /// Stop logging dirty pages 713 /// 714 fn stop_dirty_log(&self) -> vm::Result<()> { 715 let dirty_log_slots = self.dirty_log_slots.read().unwrap(); 716 for (_, s) in dirty_log_slots.iter() { 717 let region = kvm_userspace_memory_region { 718 slot: s.slot, 719 guest_phys_addr: s.guest_phys_addr, 720 memory_size: s.memory_size, 721 userspace_addr: s.userspace_addr, 722 flags: 0, 723 }; 724 // SAFETY: Safe because guest regions are guaranteed not to overlap. 725 unsafe { 726 self.fd 727 .set_user_memory_region(region) 728 .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?; 729 } 730 } 731 732 Ok(()) 733 } 734 735 /// 736 /// Get dirty pages bitmap (one bit per page) 737 /// 738 fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> { 739 self.fd 740 .get_dirty_log(slot, memory_size as usize) 741 .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into())) 742 } 743 744 /// 745 /// Initialize TDX for this VM 746 /// 747 #[cfg(feature = "tdx")] 748 fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> { 749 const TDX_ATTR_SEPT_VE_DISABLE: usize = 28; 750 751 let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> = 752 cpuid.iter().map(|e| (*e).into()).collect(); 753 cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default()); 754 755 #[repr(C)] 756 struct TdxInitVm { 757 attributes: u64, 758 max_vcpus: u32, 759 padding: u32, 760 mrconfigid: [u64; 6], 761 mrowner: [u64; 6], 762 mrownerconfig: [u64; 6], 763 cpuid_nent: u32, 764 cpuid_padding: u32, 765 cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256], 766 } 767 let data = TdxInitVm { 768 attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE, 769 max_vcpus, 770 padding: 0, 771 mrconfigid: [0; 6], 772 mrowner: [0; 6], 773 mrownerconfig: [0; 6], 774 cpuid_nent: cpuid.len() as u32, 775 cpuid_padding: 0, 776 cpuid_entries: cpuid.as_slice().try_into().unwrap(), 777 }; 778 779 tdx_command( 780 &self.fd.as_raw_fd(), 781 TdxCommand::InitVm, 782 0, 783 &data as *const _ as u64, 784 ) 785 .map_err(vm::HypervisorVmError::InitializeTdx) 786 } 787 788 /// 789 /// Finalize the TDX setup for this VM 790 /// 791 #[cfg(feature = "tdx")] 792 fn tdx_finalize(&self) -> vm::Result<()> { 793 tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0) 794 .map_err(vm::HypervisorVmError::FinalizeTdx) 795 } 796 797 /// 798 /// Initialize memory regions for the TDX VM 799 /// 800 #[cfg(feature = "tdx")] 801 fn tdx_init_memory_region( 802 &self, 803 host_address: u64, 804 guest_address: u64, 805 size: u64, 806 measure: bool, 807 ) -> vm::Result<()> { 808 #[repr(C)] 809 struct TdxInitMemRegion { 810 host_address: u64, 811 guest_address: u64, 812 pages: u64, 813 } 814 let data = TdxInitMemRegion { 815 host_address, 816 guest_address, 817 pages: size / 4096, 818 }; 819 820 tdx_command( 821 &self.fd.as_raw_fd(), 822 TdxCommand::InitMemRegion, 823 u32::from(measure), 824 &data as *const _ as u64, 825 ) 826 .map_err(vm::HypervisorVmError::InitMemRegionTdx) 827 } 828 /// Downcast to the underlying KvmVm type 829 fn as_any(&self) -> &dyn Any { 830 self 831 } 832 } 833 834 #[cfg(feature = "tdx")] 835 fn tdx_command( 836 fd: &RawFd, 837 command: TdxCommand, 838 flags: u32, 839 data: u64, 840 ) -> std::result::Result<(), std::io::Error> { 841 #[repr(C)] 842 struct TdxIoctlCmd { 843 command: TdxCommand, 844 flags: u32, 845 data: u64, 846 error: u64, 847 unused: u64, 848 } 849 let cmd = TdxIoctlCmd { 850 command, 851 flags, 852 data, 853 error: 0, 854 unused: 0, 855 }; 856 // SAFETY: FFI call. All input parameters are valid. 857 let ret = unsafe { 858 ioctl_with_val( 859 fd, 860 KVM_MEMORY_ENCRYPT_OP(), 861 &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong, 862 ) 863 }; 864 865 if ret < 0 { 866 return Err(std::io::Error::last_os_error()); 867 } 868 Ok(()) 869 } 870 871 /// Wrapper over KVM system ioctls. 872 pub struct KvmHypervisor { 873 kvm: Kvm, 874 } 875 876 impl KvmHypervisor { 877 #[cfg(target_arch = "x86_64")] 878 /// 879 /// Retrieve the list of MSRs supported by the hypervisor. 880 /// 881 fn get_msr_list(&self) -> hypervisor::Result<MsrList> { 882 self.kvm 883 .get_msr_index_list() 884 .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into())) 885 } 886 } 887 888 /// Enum for KVM related error 889 #[derive(Debug, Error)] 890 pub enum KvmError { 891 #[error("Capability missing: {0:?}")] 892 CapabilityMissing(Cap), 893 } 894 pub type KvmResult<T> = result::Result<T, KvmError>; 895 impl KvmHypervisor { 896 /// Create a hypervisor based on Kvm 897 #[allow(clippy::new_ret_no_self)] 898 pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> { 899 let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?; 900 let api_version = kvm_obj.get_api_version(); 901 902 if api_version != kvm_bindings::KVM_API_VERSION as i32 { 903 return Err(hypervisor::HypervisorError::IncompatibleApiVersion); 904 } 905 906 Ok(Arc::new(KvmHypervisor { kvm: kvm_obj })) 907 } 908 /// Check if the hypervisor is available 909 pub fn is_available() -> hypervisor::Result<bool> { 910 match std::fs::metadata("/dev/kvm") { 911 Ok(_) => Ok(true), 912 Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false), 913 Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck( 914 err.into(), 915 )), 916 } 917 } 918 } 919 /// Implementation of Hypervisor trait for KVM 920 /// Example: 921 /// #[cfg(feature = "kvm")] 922 /// extern crate hypervisor 923 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 924 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 925 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 926 /// 927 impl hypervisor::Hypervisor for KvmHypervisor { 928 /// 929 /// Returns the type of the hypervisor 930 /// 931 fn hypervisor_type(&self) -> HypervisorType { 932 HypervisorType::Kvm 933 } 934 /// Create a KVM vm object of a specific VM type and return the object as Vm trait object 935 /// Example 936 /// # extern crate hypervisor; 937 /// # use hypervisor::KvmHypervisor; 938 /// use hypervisor::KvmVm; 939 /// let hypervisor = KvmHypervisor::new().unwrap(); 940 /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap() 941 /// 942 fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> { 943 let fd: VmFd; 944 loop { 945 match self.kvm.create_vm_with_type(vm_type) { 946 Ok(res) => fd = res, 947 Err(e) => { 948 if e.errno() == libc::EINTR { 949 // If the error returned is EINTR, which means the 950 // ioctl has been interrupted, we have to retry as 951 // this can't be considered as a regular error. 952 continue; 953 } else { 954 return Err(hypervisor::HypervisorError::VmCreate(e.into())); 955 } 956 } 957 } 958 break; 959 } 960 961 let vm_fd = Arc::new(fd); 962 963 #[cfg(target_arch = "x86_64")] 964 { 965 let msr_list = self.get_msr_list()?; 966 let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize; 967 let mut msrs: Vec<MsrEntry> = vec![ 968 MsrEntry { 969 ..Default::default() 970 }; 971 num_msrs 972 ]; 973 let indices = msr_list.as_slice(); 974 for (pos, index) in indices.iter().enumerate() { 975 msrs[pos].index = *index; 976 } 977 978 Ok(Arc::new(KvmVm { 979 fd: vm_fd, 980 msrs, 981 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 982 })) 983 } 984 985 #[cfg(target_arch = "aarch64")] 986 { 987 Ok(Arc::new(KvmVm { 988 fd: vm_fd, 989 dirty_log_slots: Arc::new(RwLock::new(HashMap::new())), 990 })) 991 } 992 } 993 994 /// Create a KVM vm object and return the object as Vm trait object 995 /// Example 996 /// # extern crate hypervisor; 997 /// # use hypervisor::KvmHypervisor; 998 /// use hypervisor::KvmVm; 999 /// let hypervisor = KvmHypervisor::new().unwrap(); 1000 /// let vm = hypervisor.create_vm().unwrap() 1001 /// 1002 fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> { 1003 #[allow(unused_mut)] 1004 let mut vm_type: u64 = 0; // Create with default platform type 1005 1006 // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA 1007 // size from the host and use that when creating the VM, which may 1008 // avoid unnecessary VM creation failures. 1009 #[cfg(target_arch = "aarch64")] 1010 if self.kvm.check_extension(Cap::ArmVmIPASize) { 1011 vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap(); 1012 } 1013 1014 self.create_vm_with_type(vm_type) 1015 } 1016 1017 fn check_required_extensions(&self) -> hypervisor::Result<()> { 1018 check_required_kvm_extensions(&self.kvm) 1019 .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) 1020 } 1021 1022 #[cfg(target_arch = "x86_64")] 1023 /// 1024 /// X86 specific call to get the system supported CPUID values. 1025 /// 1026 fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> { 1027 let kvm_cpuid = self 1028 .kvm 1029 .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES) 1030 .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?; 1031 1032 let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); 1033 1034 Ok(v) 1035 } 1036 1037 #[cfg(target_arch = "aarch64")] 1038 /// 1039 /// Retrieve AArch64 host maximum IPA size supported by KVM. 1040 /// 1041 fn get_host_ipa_limit(&self) -> i32 { 1042 self.kvm.get_host_ipa_limit() 1043 } 1044 1045 /// 1046 /// Retrieve TDX capabilities 1047 /// 1048 #[cfg(feature = "tdx")] 1049 fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> { 1050 let data = TdxCapabilities { 1051 nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32, 1052 ..Default::default() 1053 }; 1054 1055 tdx_command( 1056 &self.kvm.as_raw_fd(), 1057 TdxCommand::Capabilities, 1058 0, 1059 &data as *const _ as u64, 1060 ) 1061 .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?; 1062 1063 Ok(data) 1064 } 1065 1066 /// 1067 /// Get the number of supported hardware breakpoints 1068 /// 1069 fn get_guest_debug_hw_bps(&self) -> usize { 1070 #[cfg(target_arch = "x86_64")] 1071 { 1072 4 1073 } 1074 #[cfg(target_arch = "aarch64")] 1075 { 1076 self.kvm.get_guest_debug_hw_bps() as usize 1077 } 1078 } 1079 } 1080 /// Vcpu struct for KVM 1081 pub struct KvmVcpu { 1082 fd: VcpuFd, 1083 #[cfg(target_arch = "x86_64")] 1084 msrs: Vec<MsrEntry>, 1085 vm_ops: Option<Arc<dyn vm::VmOps>>, 1086 #[cfg(target_arch = "x86_64")] 1087 hyperv_synic: AtomicBool, 1088 } 1089 /// Implementation of Vcpu trait for KVM 1090 /// Example: 1091 /// #[cfg(feature = "kvm")] 1092 /// extern crate hypervisor 1093 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 1094 /// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 1095 /// let vm = hypervisor.create_vm().expect("new VM fd creation failed"); 1096 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1097 /// vcpu.get/set().unwrap() 1098 /// 1099 impl cpu::Vcpu for KvmVcpu { 1100 #[cfg(target_arch = "x86_64")] 1101 /// 1102 /// Returns the vCPU general purpose registers. 1103 /// 1104 fn get_regs(&self) -> cpu::Result<StandardRegisters> { 1105 Ok(self 1106 .fd 1107 .get_regs() 1108 .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))? 1109 .into()) 1110 } 1111 /// 1112 /// Returns the vCPU general purpose registers. 1113 /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG` 1114 /// is used to get registers one by one. 1115 /// 1116 #[cfg(target_arch = "aarch64")] 1117 fn get_regs(&self) -> cpu::Result<StandardRegisters> { 1118 let mut state: StandardRegisters = kvm_regs::default(); 1119 let mut off = offset_of!(user_pt_regs, regs); 1120 // There are 31 user_pt_regs: 1121 // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72 1122 // These actually are the general-purpose registers of the Armv8-a 1123 // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register). 1124 for i in 0..31 { 1125 state.regs.regs[i] = self 1126 .fd 1127 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1128 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1129 .try_into() 1130 .unwrap(); 1131 off += std::mem::size_of::<u64>(); 1132 } 1133 1134 // We are now entering the "Other register" section of the ARMv8-a architecture. 1135 // First one, stack pointer. 1136 let off = offset_of!(user_pt_regs, sp); 1137 state.regs.sp = self 1138 .fd 1139 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1140 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1141 .try_into() 1142 .unwrap(); 1143 1144 // Second one, the program counter. 1145 let off = offset_of!(user_pt_regs, pc); 1146 state.regs.pc = self 1147 .fd 1148 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1149 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1150 .try_into() 1151 .unwrap(); 1152 1153 // Next is the processor state. 1154 let off = offset_of!(user_pt_regs, pstate); 1155 state.regs.pstate = self 1156 .fd 1157 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1158 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1159 .try_into() 1160 .unwrap(); 1161 1162 // The stack pointer associated with EL1 1163 let off = offset_of!(kvm_regs, sp_el1); 1164 state.sp_el1 = self 1165 .fd 1166 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1167 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1168 .try_into() 1169 .unwrap(); 1170 1171 // Exception Link Register for EL1, when taking an exception to EL1, this register 1172 // holds the address to which to return afterwards. 1173 let off = offset_of!(kvm_regs, elr_el1); 1174 state.elr_el1 = self 1175 .fd 1176 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1177 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1178 .try_into() 1179 .unwrap(); 1180 1181 // Saved Program Status Registers, there are 5 of them used in the kernel. 1182 let mut off = offset_of!(kvm_regs, spsr); 1183 for i in 0..KVM_NR_SPSR as usize { 1184 state.spsr[i] = self 1185 .fd 1186 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off)) 1187 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1188 .try_into() 1189 .unwrap(); 1190 off += std::mem::size_of::<u64>(); 1191 } 1192 1193 // Now moving on to floting point registers which are stored in the user_fpsimd_state in the kernel: 1194 // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53 1195 let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); 1196 for i in 0..32 { 1197 state.fp_regs.vregs[i] = self 1198 .fd 1199 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off)) 1200 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?; 1201 off += mem::size_of::<u128>(); 1202 } 1203 1204 // Floating-point Status Register 1205 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); 1206 state.fp_regs.fpsr = self 1207 .fd 1208 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) 1209 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1210 .try_into() 1211 .unwrap(); 1212 1213 // Floating-point Control Register 1214 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); 1215 state.fp_regs.fpcr = self 1216 .fd 1217 .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off)) 1218 .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))? 1219 .try_into() 1220 .unwrap(); 1221 Ok(state) 1222 } 1223 #[cfg(target_arch = "x86_64")] 1224 /// 1225 /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl. 1226 /// 1227 fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> { 1228 let regs = (*regs).into(); 1229 self.fd 1230 .set_regs(®s) 1231 .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into())) 1232 } 1233 1234 /// 1235 /// Sets the vCPU general purpose registers. 1236 /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG` 1237 /// is used to set registers one by one. 1238 /// 1239 #[cfg(target_arch = "aarch64")] 1240 fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> { 1241 // The function follows the exact identical order from `state`. Look there 1242 // for some additional info on registers. 1243 let mut off = offset_of!(user_pt_regs, regs); 1244 for i in 0..31 { 1245 self.fd 1246 .set_one_reg( 1247 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1248 state.regs.regs[i].into(), 1249 ) 1250 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1251 off += std::mem::size_of::<u64>(); 1252 } 1253 1254 let off = offset_of!(user_pt_regs, sp); 1255 self.fd 1256 .set_one_reg( 1257 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1258 state.regs.sp.into(), 1259 ) 1260 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1261 1262 let off = offset_of!(user_pt_regs, pc); 1263 self.fd 1264 .set_one_reg( 1265 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1266 state.regs.pc.into(), 1267 ) 1268 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1269 1270 let off = offset_of!(user_pt_regs, pstate); 1271 self.fd 1272 .set_one_reg( 1273 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1274 state.regs.pstate.into(), 1275 ) 1276 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1277 1278 let off = offset_of!(kvm_regs, sp_el1); 1279 self.fd 1280 .set_one_reg( 1281 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1282 state.sp_el1.into(), 1283 ) 1284 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1285 1286 let off = offset_of!(kvm_regs, elr_el1); 1287 self.fd 1288 .set_one_reg( 1289 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1290 state.elr_el1.into(), 1291 ) 1292 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1293 1294 let mut off = offset_of!(kvm_regs, spsr); 1295 for i in 0..KVM_NR_SPSR as usize { 1296 self.fd 1297 .set_one_reg( 1298 arm64_core_reg_id!(KVM_REG_SIZE_U64, off), 1299 state.spsr[i].into(), 1300 ) 1301 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1302 off += std::mem::size_of::<u64>(); 1303 } 1304 1305 let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); 1306 for i in 0..32 { 1307 self.fd 1308 .set_one_reg( 1309 arm64_core_reg_id!(KVM_REG_SIZE_U128, off), 1310 state.fp_regs.vregs[i], 1311 ) 1312 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1313 off += mem::size_of::<u128>(); 1314 } 1315 1316 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); 1317 self.fd 1318 .set_one_reg( 1319 arm64_core_reg_id!(KVM_REG_SIZE_U32, off), 1320 state.fp_regs.fpsr.into(), 1321 ) 1322 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1323 1324 let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); 1325 self.fd 1326 .set_one_reg( 1327 arm64_core_reg_id!(KVM_REG_SIZE_U32, off), 1328 state.fp_regs.fpcr.into(), 1329 ) 1330 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1331 Ok(()) 1332 } 1333 1334 #[cfg(target_arch = "x86_64")] 1335 /// 1336 /// Returns the vCPU special registers. 1337 /// 1338 fn get_sregs(&self) -> cpu::Result<SpecialRegisters> { 1339 Ok(self 1340 .fd 1341 .get_sregs() 1342 .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))? 1343 .into()) 1344 } 1345 #[cfg(target_arch = "x86_64")] 1346 /// 1347 /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl. 1348 /// 1349 fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> { 1350 let sregs = (*sregs).into(); 1351 self.fd 1352 .set_sregs(&sregs) 1353 .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into())) 1354 } 1355 #[cfg(target_arch = "x86_64")] 1356 /// 1357 /// Returns the floating point state (FPU) from the vCPU. 1358 /// 1359 fn get_fpu(&self) -> cpu::Result<FpuState> { 1360 Ok(self 1361 .fd 1362 .get_fpu() 1363 .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))? 1364 .into()) 1365 } 1366 #[cfg(target_arch = "x86_64")] 1367 /// 1368 /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioct. 1369 /// 1370 fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> { 1371 let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into(); 1372 self.fd 1373 .set_fpu(&fpu) 1374 .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into())) 1375 } 1376 #[cfg(target_arch = "x86_64")] 1377 /// 1378 /// X86 specific call to setup the CPUID registers. 1379 /// 1380 fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> { 1381 let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> = 1382 cpuid.iter().map(|e| (*e).into()).collect(); 1383 let kvm_cpuid = <CpuId>::from_entries(&cpuid) 1384 .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?; 1385 1386 self.fd 1387 .set_cpuid2(&kvm_cpuid) 1388 .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into())) 1389 } 1390 #[cfg(target_arch = "x86_64")] 1391 /// 1392 /// X86 specific call to enable HyperV SynIC 1393 /// 1394 fn enable_hyperv_synic(&self) -> cpu::Result<()> { 1395 // Update the information about Hyper-V SynIC being enabled and 1396 // emulated as it will influence later which MSRs should be saved. 1397 self.hyperv_synic.store(true, Ordering::Release); 1398 1399 let cap = kvm_enable_cap { 1400 cap: KVM_CAP_HYPERV_SYNIC, 1401 ..Default::default() 1402 }; 1403 self.fd 1404 .enable_cap(&cap) 1405 .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into())) 1406 } 1407 /// 1408 /// X86 specific call to retrieve the CPUID registers. 1409 /// 1410 #[cfg(target_arch = "x86_64")] 1411 fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> { 1412 let kvm_cpuid = self 1413 .fd 1414 .get_cpuid2(num_entries) 1415 .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?; 1416 1417 let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect(); 1418 1419 Ok(v) 1420 } 1421 #[cfg(target_arch = "x86_64")] 1422 /// 1423 /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 1424 /// 1425 fn get_lapic(&self) -> cpu::Result<LapicState> { 1426 Ok(self 1427 .fd 1428 .get_lapic() 1429 .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))? 1430 .into()) 1431 } 1432 #[cfg(target_arch = "x86_64")] 1433 /// 1434 /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller). 1435 /// 1436 fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> { 1437 let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into(); 1438 self.fd 1439 .set_lapic(&klapic) 1440 .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into())) 1441 } 1442 #[cfg(target_arch = "x86_64")] 1443 /// 1444 /// Returns the model-specific registers (MSR) for this vCPU. 1445 /// 1446 fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> { 1447 let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect(); 1448 let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); 1449 let succ = self 1450 .fd 1451 .get_msrs(&mut kvm_msrs) 1452 .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?; 1453 1454 msrs[..succ].copy_from_slice( 1455 &kvm_msrs.as_slice()[..succ] 1456 .iter() 1457 .map(|e| (*e).into()) 1458 .collect::<Vec<MsrEntry>>(), 1459 ); 1460 1461 Ok(succ) 1462 } 1463 #[cfg(target_arch = "x86_64")] 1464 /// 1465 /// Setup the model-specific registers (MSR) for this vCPU. 1466 /// Returns the number of MSR entries actually written. 1467 /// 1468 fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> { 1469 let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect(); 1470 let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap(); 1471 self.fd 1472 .set_msrs(&kvm_msrs) 1473 .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into())) 1474 } 1475 /// 1476 /// Returns the vcpu's current "multiprocessing state". 1477 /// 1478 fn get_mp_state(&self) -> cpu::Result<MpState> { 1479 Ok(self 1480 .fd 1481 .get_mp_state() 1482 .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))? 1483 .into()) 1484 } 1485 /// 1486 /// Sets the vcpu's current "multiprocessing state". 1487 /// 1488 fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> { 1489 self.fd 1490 .set_mp_state(mp_state.into()) 1491 .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into())) 1492 } 1493 #[cfg(target_arch = "x86_64")] 1494 /// 1495 /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl. 1496 /// 1497 fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> { 1498 let tr = self 1499 .fd 1500 .translate_gva(gva) 1501 .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?; 1502 // tr.valid is set if the GVA is mapped to valid GPA. 1503 match tr.valid { 1504 0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!( 1505 "Invalid GVA: {:#x}", 1506 gva 1507 ))), 1508 _ => Ok((tr.physical_address, 0)), 1509 } 1510 } 1511 /// 1512 /// Triggers the running of the current virtual CPU returning an exit reason. 1513 /// 1514 fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> { 1515 match self.fd.run() { 1516 Ok(run) => match run { 1517 #[cfg(target_arch = "x86_64")] 1518 VcpuExit::IoIn(addr, data) => { 1519 if let Some(vm_ops) = &self.vm_ops { 1520 return vm_ops 1521 .pio_read(addr.into(), data) 1522 .map(|_| cpu::VmExit::Ignore) 1523 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1524 } 1525 1526 Ok(cpu::VmExit::IoIn(addr, data)) 1527 } 1528 #[cfg(target_arch = "x86_64")] 1529 VcpuExit::IoOut(addr, data) => { 1530 if let Some(vm_ops) = &self.vm_ops { 1531 return vm_ops 1532 .pio_write(addr.into(), data) 1533 .map(|_| cpu::VmExit::Ignore) 1534 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1535 } 1536 1537 Ok(cpu::VmExit::IoOut(addr, data)) 1538 } 1539 #[cfg(target_arch = "x86_64")] 1540 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)), 1541 #[cfg(target_arch = "x86_64")] 1542 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset), 1543 1544 #[cfg(target_arch = "aarch64")] 1545 VcpuExit::SystemEvent(event_type, flags) => { 1546 use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN}; 1547 // On Aarch64, when the VM is shutdown, run() returns 1548 // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN 1549 if event_type == KVM_SYSTEM_EVENT_RESET { 1550 Ok(cpu::VmExit::Reset) 1551 } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN { 1552 Ok(cpu::VmExit::Shutdown) 1553 } else { 1554 Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1555 "Unexpected system event with type 0x{:x}, flags 0x{:x}", 1556 event_type, 1557 flags 1558 ))) 1559 } 1560 } 1561 1562 VcpuExit::MmioRead(addr, data) => { 1563 if let Some(vm_ops) = &self.vm_ops { 1564 return vm_ops 1565 .mmio_read(addr, data) 1566 .map(|_| cpu::VmExit::Ignore) 1567 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1568 } 1569 1570 Ok(cpu::VmExit::MmioRead(addr, data)) 1571 } 1572 VcpuExit::MmioWrite(addr, data) => { 1573 if let Some(vm_ops) = &self.vm_ops { 1574 return vm_ops 1575 .mmio_write(addr, data) 1576 .map(|_| cpu::VmExit::Ignore) 1577 .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into())); 1578 } 1579 1580 Ok(cpu::VmExit::MmioWrite(addr, data)) 1581 } 1582 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv), 1583 #[cfg(feature = "tdx")] 1584 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx), 1585 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug), 1586 1587 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1588 "Unexpected exit reason on vcpu run: {:?}", 1589 r 1590 ))), 1591 }, 1592 1593 Err(ref e) => match e.errno() { 1594 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), 1595 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( 1596 "VCPU error {:?}", 1597 e 1598 ))), 1599 }, 1600 } 1601 } 1602 #[cfg(target_arch = "x86_64")] 1603 /// 1604 /// Let the guest know that it has been paused, which prevents from 1605 /// potential soft lockups when being resumed. 1606 /// 1607 fn notify_guest_clock_paused(&self) -> cpu::Result<()> { 1608 if let Err(e) = self.fd.kvmclock_ctrl() { 1609 // Linux kernel returns -EINVAL if the PV clock isn't yet initialised 1610 // which could be because we're still in firmware or the guest doesn't 1611 // use KVM clock. 1612 if e.errno() != libc::EINVAL { 1613 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into())); 1614 } 1615 } 1616 1617 Ok(()) 1618 } 1619 /// 1620 /// Sets debug registers to set hardware breakpoints and/or enable single step. 1621 /// 1622 fn set_guest_debug( 1623 &self, 1624 addrs: &[vm_memory::GuestAddress], 1625 singlestep: bool, 1626 ) -> cpu::Result<()> { 1627 let mut dbg = kvm_guest_debug { 1628 #[cfg(target_arch = "x86_64")] 1629 control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP, 1630 #[cfg(target_arch = "aarch64")] 1631 control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW, 1632 ..Default::default() 1633 }; 1634 if singlestep { 1635 dbg.control |= KVM_GUESTDBG_SINGLESTEP; 1636 } 1637 1638 // Set the debug registers. 1639 // Here we assume that the number of addresses do not exceed what 1640 // `Hypervisor::get_guest_debug_hw_bps()` specifies. 1641 #[cfg(target_arch = "x86_64")] 1642 { 1643 // Set bits 9 and 10. 1644 // bit 9: GE (global exact breakpoint enable) flag. 1645 // bit 10: always 1. 1646 dbg.arch.debugreg[7] = 0x0600; 1647 1648 for (i, addr) in addrs.iter().enumerate() { 1649 dbg.arch.debugreg[i] = addr.0; 1650 // Set global breakpoint enable flag 1651 dbg.arch.debugreg[7] |= 2 << (i * 2); 1652 } 1653 } 1654 #[cfg(target_arch = "aarch64")] 1655 { 1656 for (i, addr) in addrs.iter().enumerate() { 1657 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2): 1658 // bit 0: 1 (Enabled) 1659 // bit 1~2: 0b11 (PMC = EL1/EL0) 1660 // bit 5~8: 0b1111 (BAS = AArch64) 1661 // others: 0 1662 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64; 1663 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3): 1664 // bit 2~52: VA[2:52] 1665 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0; 1666 } 1667 } 1668 self.fd 1669 .set_guest_debug(&dbg) 1670 .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into())) 1671 } 1672 #[cfg(target_arch = "aarch64")] 1673 fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> { 1674 self.fd 1675 .vcpu_init(kvi) 1676 .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into())) 1677 } 1678 /// 1679 /// Gets a list of the guest registers that are supported for the 1680 /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. 1681 /// 1682 #[cfg(target_arch = "aarch64")] 1683 fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> { 1684 self.fd 1685 .get_reg_list(reg_list) 1686 .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into())) 1687 } 1688 /// 1689 /// Gets the value of a system register 1690 /// 1691 #[cfg(target_arch = "aarch64")] 1692 fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> { 1693 // 1694 // Arm Architecture Reference Manual defines the encoding of 1695 // AArch64 system registers, see 1696 // https://developer.arm.com/documentation/ddi0487 (chapter D12). 1697 // While KVM defines another ID for each AArch64 system register, 1698 // which is used in calling `KVM_G/SET_ONE_REG` to access a system 1699 // register of a guest. 1700 // A mapping exists between the Arm standard encoding and the KVM ID. 1701 // This function takes the standard u32 ID as input parameter, converts 1702 // it to the corresponding KVM ID, and call `KVM_GET_ONE_REG` API to 1703 // get the value of the system parameter. 1704 // 1705 let id: u64 = KVM_REG_ARM64 1706 | KVM_REG_SIZE_U64 1707 | KVM_REG_ARM64_SYSREG as u64 1708 | ((((sys_reg) >> 5) 1709 & (KVM_REG_ARM64_SYSREG_OP0_MASK 1710 | KVM_REG_ARM64_SYSREG_OP1_MASK 1711 | KVM_REG_ARM64_SYSREG_CRN_MASK 1712 | KVM_REG_ARM64_SYSREG_CRM_MASK 1713 | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64); 1714 Ok(self 1715 .fd 1716 .get_one_reg(id) 1717 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))? 1718 .try_into() 1719 .unwrap()) 1720 } 1721 /// 1722 /// Configure core registers for a given CPU. 1723 /// 1724 #[cfg(target_arch = "aarch64")] 1725 fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { 1726 #[allow(non_upper_case_globals)] 1727 // PSR (Processor State Register) bits. 1728 // Taken from arch/arm64/include/uapi/asm/ptrace.h. 1729 const PSR_MODE_EL1h: u64 = 0x0000_0005; 1730 const PSR_F_BIT: u64 = 0x0000_0040; 1731 const PSR_I_BIT: u64 = 0x0000_0080; 1732 const PSR_A_BIT: u64 = 0x0000_0100; 1733 const PSR_D_BIT: u64 = 0x0000_0200; 1734 // Taken from arch/arm64/kvm/inject_fault.c. 1735 const PSTATE_FAULT_BITS_64: u64 = 1736 PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT; 1737 1738 let kreg_off = offset_of!(kvm_regs, regs); 1739 1740 // Get the register index of the PSTATE (Processor State) register. 1741 let pstate = offset_of!(user_pt_regs, pstate) + kreg_off; 1742 self.fd 1743 .set_one_reg( 1744 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate), 1745 PSTATE_FAULT_BITS_64.into(), 1746 ) 1747 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1748 1749 // Other vCPUs are powered off initially awaiting PSCI wakeup. 1750 if cpu_id == 0 { 1751 // Setting the PC (Processor Counter) to the current program address (kernel address). 1752 let pc = offset_of!(user_pt_regs, pc) + kreg_off; 1753 self.fd 1754 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into()) 1755 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1756 1757 // Last mandatory thing to set -> the address pointing to the FDT (also called DTB). 1758 // "The device tree blob (dtb) must be placed on an 8-byte boundary and must 1759 // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt. 1760 // We are choosing to place it the end of DRAM. See `get_fdt_addr`. 1761 let regs0 = offset_of!(user_pt_regs, regs) + kreg_off; 1762 self.fd 1763 .set_one_reg( 1764 arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), 1765 fdt_start.into(), 1766 ) 1767 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?; 1768 } 1769 Ok(()) 1770 } 1771 1772 #[cfg(target_arch = "x86_64")] 1773 /// 1774 /// Get the current CPU state 1775 /// 1776 /// Ordering requirements: 1777 /// 1778 /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify 1779 /// vCPU/LAPIC state. As such, it must be done before most everything 1780 /// else, otherwise we cannot restore everything and expect it to work. 1781 /// 1782 /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are 1783 /// still running. 1784 /// 1785 /// KVM_GET_LAPIC may change state of LAPIC before returning it. 1786 /// 1787 /// GET_VCPU_EVENTS should probably be last to save. The code looks as 1788 /// it might as well be affected by internal state modifications of the 1789 /// GET ioctls. 1790 /// 1791 /// SREGS saves/restores a pending interrupt, similar to what 1792 /// VCPU_EVENTS also does. 1793 /// 1794 /// GET_MSRS requires a pre-populated data structure to do something 1795 /// meaningful. For SET_MSRS it will then contain good data. 1796 /// 1797 /// # Example 1798 /// 1799 /// ```rust 1800 /// # extern crate hypervisor; 1801 /// # use hypervisor::KvmHypervisor; 1802 /// # use std::sync::Arc; 1803 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 1804 /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 1805 /// let vm = hv.create_vm().expect("new VM fd creation failed"); 1806 /// vm.enable_split_irq().unwrap(); 1807 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1808 /// let state = vcpu.state().unwrap(); 1809 /// ``` 1810 fn state(&self) -> cpu::Result<CpuState> { 1811 let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?; 1812 let mp_state = self.get_mp_state()?.into(); 1813 let regs = self.get_regs()?; 1814 let sregs = self.get_sregs()?; 1815 let xsave = self.get_xsave()?; 1816 let xcrs = self.get_xcrs()?; 1817 let lapic_state = self.get_lapic()?; 1818 let fpu = self.get_fpu()?; 1819 1820 // Try to get all MSRs based on the list previously retrieved from KVM. 1821 // If the number of MSRs obtained from GET_MSRS is different from the 1822 // expected amount, we fallback onto a slower method by getting MSRs 1823 // by chunks. This is the only way to make sure we try to get as many 1824 // MSRs as possible, even if some MSRs are not supported. 1825 let mut msr_entries = self.msrs.clone(); 1826 1827 // Save extra MSRs if the Hyper-V synthetic interrupt controller is 1828 // emulated. 1829 if self.hyperv_synic.load(Ordering::Acquire) { 1830 let hyperv_synic_msrs = vec![ 1831 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084, 1832 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096, 1833 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d, 1834 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4, 1835 0x400000b5, 0x400000b6, 0x400000b7, 1836 ]; 1837 for index in hyperv_synic_msrs { 1838 let msr = kvm_msr_entry { 1839 index, 1840 ..Default::default() 1841 }; 1842 msr_entries.push(msr.into()); 1843 } 1844 } 1845 1846 let expected_num_msrs = msr_entries.len(); 1847 let num_msrs = self.get_msrs(&mut msr_entries)?; 1848 let msrs = if num_msrs != expected_num_msrs { 1849 let mut faulty_msr_index = num_msrs; 1850 let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec(); 1851 1852 loop { 1853 warn!( 1854 "Detected faulty MSR 0x{:x} while getting MSRs", 1855 msr_entries[faulty_msr_index].index 1856 ); 1857 1858 // Skip the first bad MSR 1859 let start_pos = faulty_msr_index + 1; 1860 1861 let mut sub_msr_entries = msr_entries[start_pos..].to_vec(); 1862 let num_msrs = self.get_msrs(&mut sub_msr_entries)?; 1863 1864 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]); 1865 1866 if num_msrs == sub_msr_entries.len() { 1867 break; 1868 } 1869 1870 faulty_msr_index = start_pos + num_msrs; 1871 } 1872 1873 msr_entries_tmp 1874 } else { 1875 msr_entries 1876 }; 1877 1878 let vcpu_events = self.get_vcpu_events()?; 1879 1880 Ok(VcpuKvmState { 1881 cpuid, 1882 msrs, 1883 vcpu_events, 1884 regs: regs.into(), 1885 sregs: sregs.into(), 1886 fpu, 1887 lapic_state, 1888 xsave, 1889 xcrs, 1890 mp_state, 1891 } 1892 .into()) 1893 } 1894 /// 1895 /// Get the current AArch64 CPU state 1896 /// 1897 #[cfg(target_arch = "aarch64")] 1898 fn state(&self) -> cpu::Result<CpuState> { 1899 let mut state = VcpuKvmState { 1900 mp_state: self.get_mp_state()?.into(), 1901 ..Default::default() 1902 }; 1903 // Get core registers 1904 state.core_regs = self.get_regs()?; 1905 1906 // Get systerm register 1907 // Call KVM_GET_REG_LIST to get all registers available to the guest. 1908 // For ArmV8 there are around 500 registers. 1909 let mut sys_regs: Vec<Register> = Vec::new(); 1910 let mut reg_list = RegList::new(500).unwrap(); 1911 self.fd 1912 .get_reg_list(&mut reg_list) 1913 .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?; 1914 1915 // At this point reg_list should contain: core registers and system 1916 // registers. 1917 // The register list contains the number of registers and their ids. We 1918 // will be needing to call KVM_GET_ONE_REG on each id in order to save 1919 // all of them. We carve out from the list the core registers which are 1920 // represented in the kernel by kvm_regs structure and for which we can 1921 // calculate the id based on the offset in the structure. 1922 reg_list.retain(|regid| is_system_register(*regid)); 1923 1924 // Now, for the rest of the registers left in the previously fetched 1925 // register list, we are simply calling KVM_GET_ONE_REG. 1926 let indices = reg_list.as_slice(); 1927 for index in indices.iter() { 1928 sys_regs.push(kvm_bindings::kvm_one_reg { 1929 id: *index, 1930 addr: self 1931 .fd 1932 .get_one_reg(*index) 1933 .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))? 1934 .try_into() 1935 .unwrap(), 1936 }); 1937 } 1938 1939 state.sys_regs = sys_regs; 1940 1941 Ok(state.into()) 1942 } 1943 #[cfg(target_arch = "x86_64")] 1944 /// 1945 /// Restore the previously saved CPU state 1946 /// 1947 /// Ordering requirements: 1948 /// 1949 /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are 1950 /// still running. 1951 /// 1952 /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so 1953 /// if we ever change the BSP, we have to do that before restoring anything. 1954 /// The same seems to be true for CPUID stuff. 1955 /// 1956 /// SREGS saves/restores a pending interrupt, similar to what 1957 /// VCPU_EVENTS also does. 1958 /// 1959 /// SET_REGS clears pending exceptions unconditionally, thus, it must be 1960 /// done before SET_VCPU_EVENTS, which restores it. 1961 /// 1962 /// SET_LAPIC must come after SET_SREGS, because the latter restores 1963 /// the apic base msr. 1964 /// 1965 /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR 1966 /// only restores successfully, when the LAPIC is correctly configured. 1967 /// 1968 /// Arguments: CpuState 1969 /// # Example 1970 /// 1971 /// ```rust 1972 /// # extern crate hypervisor; 1973 /// # use hypervisor::KvmHypervisor; 1974 /// # use std::sync::Arc; 1975 /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap(); 1976 /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm); 1977 /// let vm = hv.create_vm().expect("new VM fd creation failed"); 1978 /// vm.enable_split_irq().unwrap(); 1979 /// let vcpu = vm.create_vcpu(0, None).unwrap(); 1980 /// let state = vcpu.state().unwrap(); 1981 /// vcpu.set_state(&state).unwrap(); 1982 /// ``` 1983 fn set_state(&self, state: &CpuState) -> cpu::Result<()> { 1984 let state: VcpuKvmState = state.clone().into(); 1985 self.set_cpuid2(&state.cpuid)?; 1986 self.set_mp_state(state.mp_state.into())?; 1987 self.set_regs(&state.regs.into())?; 1988 self.set_sregs(&state.sregs.into())?; 1989 self.set_xsave(&state.xsave)?; 1990 self.set_xcrs(&state.xcrs)?; 1991 self.set_lapic(&state.lapic_state)?; 1992 self.set_fpu(&state.fpu)?; 1993 1994 // Try to set all MSRs previously stored. 1995 // If the number of MSRs set from SET_MSRS is different from the 1996 // expected amount, we fallback onto a slower method by setting MSRs 1997 // by chunks. This is the only way to make sure we try to set as many 1998 // MSRs as possible, even if some MSRs are not supported. 1999 let expected_num_msrs = state.msrs.len(); 2000 let num_msrs = self.set_msrs(&state.msrs)?; 2001 if num_msrs != expected_num_msrs { 2002 let mut faulty_msr_index = num_msrs; 2003 2004 loop { 2005 warn!( 2006 "Detected faulty MSR 0x{:x} while setting MSRs", 2007 state.msrs[faulty_msr_index].index 2008 ); 2009 2010 // Skip the first bad MSR 2011 let start_pos = faulty_msr_index + 1; 2012 2013 let sub_msr_entries = state.msrs[start_pos..].to_vec(); 2014 2015 let num_msrs = self.set_msrs(&sub_msr_entries)?; 2016 2017 if num_msrs == sub_msr_entries.len() { 2018 break; 2019 } 2020 2021 faulty_msr_index = start_pos + num_msrs; 2022 } 2023 } 2024 2025 self.set_vcpu_events(&state.vcpu_events)?; 2026 2027 Ok(()) 2028 } 2029 /// 2030 /// Restore the previously saved AArch64 CPU state 2031 /// 2032 #[cfg(target_arch = "aarch64")] 2033 fn set_state(&self, state: &CpuState) -> cpu::Result<()> { 2034 let state: VcpuKvmState = state.clone().into(); 2035 // Set core registers 2036 self.set_regs(&state.core_regs)?; 2037 // Set system registers 2038 for reg in &state.sys_regs { 2039 self.fd 2040 .set_one_reg(reg.id, reg.addr.into()) 2041 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?; 2042 } 2043 2044 self.set_mp_state(state.mp_state.into())?; 2045 2046 Ok(()) 2047 } 2048 2049 /// 2050 /// Initialize TDX for this CPU 2051 /// 2052 #[cfg(feature = "tdx")] 2053 fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> { 2054 tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address) 2055 .map_err(cpu::HypervisorCpuError::InitializeTdx) 2056 } 2057 2058 /// 2059 /// Set the "immediate_exit" state 2060 /// 2061 fn set_immediate_exit(&self, exit: bool) { 2062 self.fd.set_kvm_immediate_exit(exit.into()); 2063 } 2064 2065 /// 2066 /// Returns the details about TDX exit reason 2067 /// 2068 #[cfg(feature = "tdx")] 2069 fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> { 2070 let kvm_run = self.fd.get_kvm_run(); 2071 // SAFETY: accessing a union field in a valid structure 2072 let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; 2073 2074 tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND; 2075 2076 if tdx_vmcall.type_ != 0 { 2077 return Err(cpu::HypervisorCpuError::UnknownTdxVmCall); 2078 } 2079 2080 match tdx_vmcall.subfunction { 2081 TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote), 2082 TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => { 2083 Ok(TdxExitDetails::SetupEventNotifyInterrupt) 2084 } 2085 _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall), 2086 } 2087 } 2088 2089 /// 2090 /// Set the status code for TDX exit 2091 /// 2092 #[cfg(feature = "tdx")] 2093 fn set_tdx_status(&mut self, status: TdxExitStatus) { 2094 let kvm_run = self.fd.get_kvm_run(); 2095 // SAFETY: accessing a union field in a valid structure 2096 let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall }; 2097 2098 tdx_vmcall.status_code = match status { 2099 TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS, 2100 TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND, 2101 }; 2102 } 2103 #[cfg(target_arch = "x86_64")] 2104 /// 2105 /// Return the list of initial MSR entries for a VCPU 2106 /// 2107 fn boot_msr_entries(&self) -> Vec<MsrEntry> { 2108 use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB}; 2109 2110 [ 2111 msr!(msr_index::MSR_IA32_SYSENTER_CS), 2112 msr!(msr_index::MSR_IA32_SYSENTER_ESP), 2113 msr!(msr_index::MSR_IA32_SYSENTER_EIP), 2114 msr!(msr_index::MSR_STAR), 2115 msr!(msr_index::MSR_CSTAR), 2116 msr!(msr_index::MSR_LSTAR), 2117 msr!(msr_index::MSR_KERNEL_GS_BASE), 2118 msr!(msr_index::MSR_SYSCALL_MASK), 2119 msr!(msr_index::MSR_IA32_TSC), 2120 msr_data!( 2121 msr_index::MSR_IA32_MISC_ENABLE, 2122 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64 2123 ), 2124 msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB), 2125 ] 2126 .to_vec() 2127 } 2128 #[cfg(target_arch = "aarch64")] 2129 fn has_pmu_support(&self) -> bool { 2130 let cpu_attr = kvm_bindings::kvm_device_attr { 2131 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2132 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT), 2133 addr: 0x0, 2134 flags: 0, 2135 }; 2136 self.fd.has_device_attr(&cpu_attr).is_ok() 2137 } 2138 #[cfg(target_arch = "aarch64")] 2139 fn init_pmu(&self, irq: u32) -> cpu::Result<()> { 2140 let cpu_attr = kvm_bindings::kvm_device_attr { 2141 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2142 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT), 2143 addr: 0x0, 2144 flags: 0, 2145 }; 2146 let cpu_attr_irq = kvm_bindings::kvm_device_attr { 2147 group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL, 2148 attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ), 2149 addr: &irq as *const u32 as u64, 2150 flags: 0, 2151 }; 2152 self.fd 2153 .set_device_attr(&cpu_attr_irq) 2154 .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?; 2155 self.fd 2156 .set_device_attr(&cpu_attr) 2157 .map_err(|_| cpu::HypervisorCpuError::InitializePmu) 2158 } 2159 2160 #[cfg(target_arch = "x86_64")] 2161 /// 2162 /// Get the frequency of the TSC if available 2163 /// 2164 fn tsc_khz(&self) -> cpu::Result<Option<u32>> { 2165 match self.fd.get_tsc_khz() { 2166 Err(e) => { 2167 if e.errno() == libc::EIO { 2168 Ok(None) 2169 } else { 2170 Err(cpu::HypervisorCpuError::GetTscKhz(e.into())) 2171 } 2172 } 2173 Ok(v) => Ok(Some(v)), 2174 } 2175 } 2176 } 2177 2178 impl KvmVcpu { 2179 #[cfg(target_arch = "x86_64")] 2180 /// 2181 /// X86 specific call that returns the vcpu's current "xsave struct". 2182 /// 2183 fn get_xsave(&self) -> cpu::Result<Xsave> { 2184 self.fd 2185 .get_xsave() 2186 .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into())) 2187 } 2188 #[cfg(target_arch = "x86_64")] 2189 /// 2190 /// X86 specific call that sets the vcpu's current "xsave struct". 2191 /// 2192 fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> { 2193 self.fd 2194 .set_xsave(xsave) 2195 .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) 2196 } 2197 #[cfg(target_arch = "x86_64")] 2198 /// 2199 /// X86 specific call that returns the vcpu's current "xcrs". 2200 /// 2201 fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> { 2202 self.fd 2203 .get_xcrs() 2204 .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into())) 2205 } 2206 #[cfg(target_arch = "x86_64")] 2207 /// 2208 /// X86 specific call that sets the vcpu's current "xcrs". 2209 /// 2210 fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> { 2211 self.fd 2212 .set_xcrs(xcrs) 2213 .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into())) 2214 } 2215 #[cfg(target_arch = "x86_64")] 2216 /// 2217 /// Returns currently pending exceptions, interrupts, and NMIs as well as related 2218 /// states of the vcpu. 2219 /// 2220 fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> { 2221 self.fd 2222 .get_vcpu_events() 2223 .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into())) 2224 } 2225 #[cfg(target_arch = "x86_64")] 2226 /// 2227 /// Sets pending exceptions, interrupts, and NMIs as well as related states 2228 /// of the vcpu. 2229 /// 2230 fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> { 2231 self.fd 2232 .set_vcpu_events(events) 2233 .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into())) 2234 } 2235 } 2236