// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

///
/// Implementation of Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// vm.set/get().unwrap()
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
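    ///
    /// A minimal sketch of calling this through the `Vm` trait object (the
    /// exact address is an assumption; VMMs typically pick an unused
    /// guest-physical range just below 4 GiB):
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// vm.set_tss_address(0xfffb_d000).unwrap();
    /// ```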
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
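    ///
    /// A minimal sketch of the register/unregister pairing (the MMIO address
    /// is an assumption for illustration):
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// use hypervisor::IoEventAddress;
    /// use vmm_sys_util::eventfd::EventFd;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// let evt = EventFd::new(0).unwrap();
    /// let addr = IoEventAddress::Mmio(0xd000_0000);
    /// vm.register_ioevent(&evt, &addr, None).unwrap();
    /// vm.unregister_ioevent(&evt, &addr).unwrap();
    /// ```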
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, the range of the 'devid' is limited: it
                    // must fit in 16 bits (at most 65536 distinct values).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |    bus     |   device   |  function  |
                    //
                    // Given that we support only 1 bus per segment, we can
                    // build a 'devid' by replacing the 'bus' bits with the
                    // low 8 bits of the 'segment' data.
                    // This resolves the range-checking problem and gives a
                    // different 'devid' to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need the dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip.
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    pub fn new() -> hypervisor::Result<KvmHypervisor> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(KvmHypervisor { kvm: kvm_obj })
    }
}
/// Implementation of Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
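        //
        // Per the KVM API, on AArch64 the IPA size is passed through the
        // "machine type" argument of KVM_CREATE_VM: the low 8 bits of the
        // type encode the requested IPA width in bits (0 selects the
        // legacy 40-bit default), which is why the host IPA limit can be
        // assigned to `vm_type` directly below.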
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// vcpu.get/set().unwrap()
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, so `KVM_GET_ONE_REG`
    /// is used to get the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are actually the general-purpose registers of the Armv8-A
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
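        //
        // As an illustration of what `arm64_core_reg_id!` produces below:
        // each core register id is KVM_REG_ARM64 | KVM_REG_SIZE_* |
        // KVM_REG_ARM_CORE OR'd with the field's offset within `kvm_regs`,
        // counted in 32-bit words (see the KVM_SET_ONE_REG section of
        // Documentation/virt/kvm/api.rst).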
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "other registers" section of the Armv8-A architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Exception Link Register for EL1: when taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating point registers, which are stored in the
        // user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
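    ///
    /// A minimal get/modify/set sketch (the field names follow this crate's
    /// `StandardRegisters`, which mirrors `kvm_regs`; the entry point value
    /// is an assumption):
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mut regs = vcpu.get_regs().unwrap();
    /// regs.rip = 0x100_0000; // hypothetical guest entry point
    /// regs.rflags = 0x2; // bit 1 of RFLAGS is always set
    /// vcpu.set_regs(&regs).unwrap();
    /// ```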
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, so `KVM_SET_ONE_REG`
    /// is used to set the registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function follows the exact same order as `get_regs`. Look
        // there for additional info on each register.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
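    /// Note that the entries must be pre-populated with the MSR indices to
    /// read; the returned count is how many entries were actually filled in.
    ///
    /// A minimal sketch (the import path is assumed from this crate's layout,
    /// and 0x174 is IA32_SYSENTER_CS, used purely as an illustration):
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// use hypervisor::arch::x86::MsrEntry;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mut msrs = vec![MsrEntry {
    ///     index: 0x174,
    ///     ..Default::default()
    /// }];
    /// let read = vcpu.get_msrs(&mut msrs).unwrap();
    /// assert_eq!(read, 1);
    /// ```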
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates a guest virtual address to a guest physical address using the
    /// `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU, returning an exit reason.
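    ///
    /// A minimal sketch of a vCPU loop driven by this method; the handling
    /// shown (continue on `Ignore`, break on `Reset`/`Shutdown`) is
    /// illustrative only, not this crate's VMM logic:
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// use hypervisor::cpu::VmExit;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// loop {
    ///     match vcpu.run().unwrap() {
    ///         VmExit::Ignore => continue, // interrupted, or handled via VmOps
    ///         VmExit::Reset | VmExit::Shutdown => break,
    ///         _ => break, // handle the remaining exits as needed
    ///     }
    /// }
    /// ```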
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // The Linux kernel returns -EINVAL if the PV clock isn't yet initialised,
            // which could be because we're still in firmware or because the guest
            // doesn't use the KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
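    ///
    /// A minimal sketch setting one hardware breakpoint and enabling
    /// single-stepping (the breakpoint address is an assumption):
    ///
    /// ```no_run
    /// # use std::sync::Arc;
    /// use vm_memory::GuestAddress;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// vcpu.set_guest_debug(&[GuestAddress(0x100_0000)], true).unwrap();
    /// ```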
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "At most 4 breakpoints are supported, but {} addresses were passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set the global breakpoint enable flag for this breakpoint.
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For Armv8 there are around 500 registers.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain the core registers and the system registers.
        // The register list contains the number of registers and their ids. We will need to
        // call KVM_GET_ONE_REG on each id in order to save all of them. We carve out from the
        // list the core registers, which are represented in the kernel by the kvm_regs structure
        // and for which we can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched register list, we
        // simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // The other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (program counter) to the current program address (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change the state of the LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save. The code looks as
    /// if it might as well be affected by internal state modifications of the
    /// GET ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fall back on a slower method: getting the MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first faulty MSR and retry from the entry right
                // after it.
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
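    ///
    /// A sketch of the aarch64 counterpart to the x86_64 example above
    /// (marked `ignore` since it only builds on aarch64):
    ///
    /// ```ignore
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```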
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the APIC base MSR.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
    ///
    /// Arguments: CpuState
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set through SET_MSRS is different from the
        // expected amount, we fall back to a slower method, setting the MSRs
        // in chunks. This is the only way to make sure we try to set as many
        // MSRs as possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first faulty MSR and retry from the entry right
                // after it.
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
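    ///
    /// As with `state()`, a sketch of the save/restore round trip on
    /// aarch64 (marked `ignore` since it only builds there):
    ///
    /// ```ignore
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```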
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about the TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for the TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    ///
    /// Check whether a PMUv3 can be exposed to this vCPU, by probing for the
    /// KVM_ARM_VCPU_PMU_V3_CTRL device attribute.
    ///
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    ///
    /// Initialize the vPMU for this vCPU: wire the PMU overflow interrupt
    /// first, then initialize the PMU itself.
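    ///
    /// A sketch of the expected call sequence (aarch64-only; `irq` stands
    /// for the PPI number the VMM reserved for the PMU):
    ///
    /// ```ignore
    /// if vcpu.has_pmu_support() {
    ///     vcpu.init_pmu(irq)?;
    /// }
    /// ```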
    ///
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        // The interrupt must be set before the PMU is initialized.
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as
    /// related states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}
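
// A save/restore smoke test sketched from the doc examples above. It is
// illustrative only: it assumes the test environment exposes a usable
// /dev/kvm, and it reuses the call sequence shown in those examples.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod vcpu_state_tests {
    use super::KvmHypervisor;
    use std::sync::Arc;

    #[test]
    fn vcpu_state_round_trip() {
        let kvm = KvmHypervisor::new().unwrap();
        let hv: Arc<dyn crate::hypervisor::Hypervisor> = Arc::new(kvm);
        let vm = hv.create_vm().expect("new VM fd creation failed");
        vm.enable_split_irq().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        // Saving and immediately restoring the state of a freshly created
        // vCPU should succeed.
        let state = vcpu.state().unwrap();
        vcpu.set_state(&state).unwrap();
    }
}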