// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
    KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_OP0_MASK,
    KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}
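// A minimal illustrative sketch (not part of the original module): a unit test
// exercising the flag translation implemented above. `UserMemoryRegion` flags
// are additive (READ is always implied), whereas KVM expresses read-only and
// dirty-page-logging as explicit flags.
#[cfg(test)]
mod user_memory_region_flag_tests {
    use super::*;

    #[test]
    fn dirty_log_region_is_readable_writable_and_logged() {
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x20_0000,
            userspace_addr: 0x7f00_0000_0000,
            flags: KVM_MEM_LOG_DIRTY_PAGES,
        };

        let region: UserMemoryRegion = kvm_region.into();
        assert_eq!(
            region.flags,
            USER_MEMORY_REGION_READ | USER_MEMORY_REGION_WRITE | USER_MEMORY_REGION_LOG_DIRTY
        );

        // Converting back restores the original KVM flags.
        let kvm_region: kvm_userspace_memory_region = region.into();
        assert_eq!(kvm_region.flags, KVM_MEM_LOG_DIRTY_PAGES);
    }
}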
impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}
///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
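    // Illustrative usage sketch (hypothetical fds and addresses, not from the
    // original source): an ioeventfd lets a guest MMIO write be signalled on
    // an eventfd without exiting to userspace, while an irqfd injects a GSI
    // when the VMM signals it.
    //
    //     let io_event = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    //     vm.register_ioevent(&io_event, &IoEventAddress::Mmio(0xd000_0000), None)?;
    //
    //     let irq_event = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    //     vm.register_irqfd(&irq_event, 5)?;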
    ///
    /// Unregisters an event from a certain address to which it has been previously registered.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry.
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, there is a limitation on the range of the
                    // 'devid': it cannot exceed 65535 (the maximum of u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // is in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build
                    // a 'devid' by replacing the 'bus' bits with the low 8
                    // bits of the 'segment' data.
                    // This way we resolve the range-checking problem and give
                    // a different `devid` to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }
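    // Worked example for the 'devid' packing above (illustrative, not from
    // the original source): a device at segment 0x0002, bus 0x03,
    // device/function byte 0x08 has the u32 BDF 0x0002_0308.
    //
    //     (0x0002_0308 & 0x00ff_0000) >> 8 = 0x0200  // segment low byte, moved into the bus position
    //      0x0002_0308 & 0xff              = 0x0008  // device/function byte kept as-is
    //
    // The resulting 'devid' is 0x0208, which fits the u16 range KVM enforces.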
    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing was initialized with entries.len() and is now being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need the dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create the guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, the dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
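    // Illustrative usage sketch (hypothetical values): a region that requests
    // dirty-page logging is initially registered without
    // KVM_MEM_LOG_DIRTY_PAGES; the flag is only applied to the tracked slot
    // later, by `start_dirty_log()`.
    //
    //     let region = vm.make_user_memory_region(
    //         0,           // slot
    //         0x10_0000,   // guest_phys_addr
    //         0x20_0000,   // memory_size
    //         host_addr,   // userspace_addr
    //         false,       // readonly
    //         true,        // log_dirty_pages
    //     );
    //     vm.create_user_memory_region(region)?;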
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in-kernel,
        // the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }
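    // Illustrative sketch of the dirty-log lifecycle (not from the original
    // source): `start_dirty_log()` flips the tracked slots to
    // KVM_MEM_LOG_DIRTY_PAGES, and `get_dirty_log()` then returns one u64 word
    // per 64 guest pages, with bit N of word W marking page (W * 64 + N) as
    // dirty.
    //
    //     vm.start_dirty_log()?;
    //     let bitmap = vm.get_dirty_log(slot, base_gpa, memory_size)?;
    //     let dirty_pages: u32 = bitmap.iter().map(|w| w.count_ones()).sum();
    //     vm.stop_dirty_log()?;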
    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM-related errors
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(KvmVmType::LegacyVm).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64; `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Exception Link Register for EL1. When taking an exception to EL1,
        // this register holds the address to which to return afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }
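    // Illustrative sketch (not from the original source): every AArch64 core
    // register access above follows the same pattern, i.e. compute the byte
    // offset of the field inside `kvm_regs` and wrap it into a KVM register id
    // with `arm64_core_reg_id!`.
    //
    //     let off = offset__of!(user_pt_regs, pc);
    //     let pc = vcpu_fd.get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))?;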
    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64; `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the same order as `get_regs`; see there for
        // some additional info on the registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable the Hyper-V SynIC.
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
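    // Illustrative sketch (hypothetical MSR index, not from the original
    // source): `get_msrs` may return fewer entries than requested when an
    // index is unsupported, so callers must check the returned count.
    //
    //     let mut msrs = vec![MsrEntry { index: 0x174, ..Default::default() }]; // 0x174 = IA32_SYSENTER_CS
    //     let read = vcpu.get_msrs(&mut msrs)?;
    //     assert!(read <= msrs.len());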
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU and returns an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
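    // Illustrative sketch (hypothetical event loop, not from the original
    // source): a VMM thread typically drives `run()` in a loop and dispatches
    // on the exit reasons mapped above.
    //
    //     loop {
    //         match vcpu.run()? {
    //             cpu::VmExit::Ignore => {}    // EINTR/EAGAIN or I/O already handled via VmOps
    //             cpu::VmExit::Reset => break, // guest asked for a reset
    //             cpu::VmExit::Shutdown => break,
    //             other => { /* forward the remaining exits to the VMM */ }
    //         }
    //     }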
    #[cfg(target_arch = "x86_64")]
    ///
    /// Lets the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or the guest doesn't
            // use KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "Support 4 breakpoints at most but {} addresses are passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
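    // Worked example for the DR7 setup above (illustrative, not from the
    // original source): with two breakpoint addresses,
    // debugreg[7] = 0x0600 | (2 << 0) | (2 << 2) = 0x060a,
    // i.e. the GE flag plus the global-enable bits G0 and G1 for slots 0 and 1.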
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Gets the value of a system register
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
        //
        // The Arm Architecture Reference Manual defines the encoding of
        // AArch64 system registers, see
        // https://developer.arm.com/documentation/ddi0487 (chapter D12).
        // KVM defines its own ID for each AArch64 system register, which
        // is used when calling `KVM_GET_ONE_REG`/`KVM_SET_ONE_REG` to
        // access a system register of a guest.
        // A mapping exists between the Arm standard encoding and the KVM ID.
        // This function takes the standard u32 ID as input parameter, converts
        // it to the corresponding KVM ID, and calls `KVM_GET_ONE_REG` to
        // get the value of the system register.
        //
        let id: u64 = KVM_REG_ARM64 as u64
            | KVM_REG_SIZE_U64 as u64
            | KVM_REG_ARM64_SYSREG as u64
            | ((((sys_reg) >> 5)
                & (KVM_REG_ARM64_SYSREG_OP0_MASK
                    | KVM_REG_ARM64_SYSREG_OP1_MASK
                    | KVM_REG_ARM64_SYSREG_CRN_MASK
                    | KVM_REG_ARM64_SYSREG_CRM_MASK
                    | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
        self.fd
            .get_one_reg(id)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
                PSTATE_FAULT_BITS_64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (Program Counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }
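    // Illustrative usage sketch (hypothetical addresses, not from the original
    // source): the boot vCPU gets PC pointing at the kernel entry point and x0
    // pointing at the FDT, while secondary vCPUs only get the masked PSTATE
    // and wait for a PSCI wake-up.
    //
    //     vcpu.setup_regs(0, kernel_entry_gpa, fdt_gpa)?;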
    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be saved last, since it may be
    /// affected by internal state modifications of the other GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back to a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            ..Default::default()
        };
        // Get core registers
        state.core_regs = self.get_regs()?;

        // Get system registers
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 of them.
        let mut sys_regs: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list contains both the core registers and the
        // system registers.
        // The register list contains the number of registers and their ids.
        // We will need to call KVM_GET_ONE_REG on each id in order to save
        // all of them. We carve the core registers out of the list; they are
        // represented in the kernel by the kvm_regs structure, for which we
        // can calculate each id based on the offset within that structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
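        // Each register is saved as a kvm_one_reg (id, value) pair; the same
        // ids are fed back to KVM_SET_ONE_REG when this state is restored by
        // `set_state` below.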
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            sys_regs.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        state.sys_regs = sys_regs;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring
    /// anything. The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS differs from the expected
        // amount, we fall back to a slower method, setting the MSRs in
        // chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some are not supported.
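        // set_msrs() returns how many MSR entries KVM accepted before hitting
        // the first failure, so that count is also the index of the faulty
        // entry that the retry loop below skips over.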
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        // Set core registers
        self.set_regs(&state.core_regs)?;
        // Set system registers
        for reg in &state.sys_regs {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }

        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for a TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
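            // The two entries below carry explicit initial data rather than a
            // zeroed value: fast-string operations get enabled, and the MTRR
            // default memory type is set to write-back.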
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        // The PMU overflow interrupt must be set before the PMU itself is
        // initialized.
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that returns the vCPU's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that sets the vCPU's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that returns the vCPU's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86-specific call that sets the vCPU's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs, as well as
    /// related states of the vCPU.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs, as well as related
    /// states of the vCPU.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}
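// A minimal sketch checking the standard-encoding-to-KVM-ID conversion used
// by `get_sys_reg` above, with MPIDR_EL1 (op0=3, op1=0, CRn=0, CRm=0, op2=5)
// as the worked example. The `mpidr_el1` value is an assumption for
// illustration, built from the kernel's `sys_reg()` field positions (op0 at
// bit 19 down to op2 at bit 5); it is not a constant taken from this crate.
#[cfg(all(test, target_arch = "aarch64"))]
mod sysreg_encoding_tests {
    use super::*;

    #[test]
    fn mpidr_el1_kvm_id() {
        // MPIDR_EL1 in the Arm standard encoding: sys_reg(3, 0, 0, 0, 5).
        let mpidr_el1: u32 = (3 << 19) | (5 << 5);
        // Same conversion as in `get_sys_reg`.
        let id: u64 = KVM_REG_ARM64 as u64
            | KVM_REG_SIZE_U64 as u64
            | KVM_REG_ARM64_SYSREG as u64
            | (((mpidr_el1 >> 5)
                & (KVM_REG_ARM64_SYSREG_OP0_MASK
                    | KVM_REG_ARM64_SYSREG_OP1_MASK
                    | KVM_REG_ARM64_SYSREG_CRN_MASK
                    | KVM_REG_ARM64_SYSREG_CRM_MASK
                    | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
        // After the shift, op0 lands in bits [15:14] of the register index
        // and op2 in bits [2:0], so the low 16 bits must read 0xc005.
        assert_eq!(id & 0xffff, 0xc005);
    }
}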