// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState, MPIDR_EL1,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::Vgic;
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset__of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_guest_debug, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC,
    KVM_CAP_SPLIT_IRQCHIP, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_irq_routing,
    kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region, KVM_IRQ_ROUTING_IRQCHIP,
    KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_NR_SPSR, KVM_REG_ARM64, KVM_REG_ARM_CORE,
    KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 35;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}
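
// A minimal, test-only sketch of the flag mapping above: KVM_MEM_READONLY
// drops the generic WRITE permission, while READ is always present.
#[cfg(test)]
mod user_memory_region_flag_tests {
    use super::*;

    #[test]
    fn readonly_kvm_flags_map_to_generic_flags() {
        let kvm_region = kvm_userspace_memory_region {
            slot: 0,
            guest_phys_addr: 0x10_0000,
            memory_size: 0x1000,
            userspace_addr: 0,
            flags: KVM_MEM_READONLY,
        };
        let region: UserMemoryRegion = kvm_region.into();
        assert_ne!(region.flags & USER_MEMORY_REGION_READ, 0);
        assert_eq!(region.flags & USER_MEMORY_REGION_WRITE, 0);
        assert_eq!(region.flags & USER_MEMORY_REGION_LOG_DIRTY, 0);
    }
}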

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
    fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

///
/// Implementation of the Vm trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// e.g. vm.get_clock().unwrap();
///
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(
        &self,
        vcpu_count: u64,
        dist_addr: u64,
        dist_size: u64,
        redist_size: u64,
        msi_size: u64,
        nr_irqs: u32,
    ) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(
            self,
            vcpu_count,
            dist_addr,
            dist_size,
            redist_size,
            msi_size,
            nr_irqs,
        )
        .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
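    /// (Hedged sketch with a hypothetical address: an eventfd registered via
    /// `register_ioevent(&fd, &IoEventAddress::Mmio(0xd000_0000), None)` is
    /// torn down with `unregister_ioevent(&fd, &IoEventAddress::Mmio(0xd000_0000))`.)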
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64 the range of the 'devid' is limited: it
                    // cannot be greater than 65535 (the maximum of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // occupies the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |     segment     |     bus    |   device   |  function  |
                    //
                    // Since we currently support only one bus per segment, we
                    // can build a 'devid' by replacing the 'bus' bits with the
                    // low 8 bits of the 'segment' data.
                    // This resolves the range-checking problem and still gives
                    // a distinct `devid` to every device. The limitation is
                    // that at most 256 segments can be supported.
                    //
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }
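
    // Worked example of the `devid` remapping in `make_routing_entry` above
    // (the value is hypothetical): devid 0x0002_1823 encodes segment 0x0002,
    // bus 0x18, device 4, function 3. The low byte of the segment replaces
    // the bus byte:
    //     (0x0002_1823 & 0x00ff_0000) >> 8 | (0x0002_1823 & 0xff) == 0x0223,
    // which stays within the u16 range KVM enforces for MSI devids.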

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
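
    // Hedged usage sketch for the region helpers above (identifiers such as
    // `vm` and `host_va` are hypothetical):
    //
    //     let region = vm.make_user_memory_region(
    //         0,          // slot
    //         0x10_0000,  // guest_phys_addr
    //         0x20_0000,  // memory_size
    //         host_va,    // userspace_addr
    //         false,      // readonly
    //         true,       // log_dirty_pages
    //     );
    //     vm.create_user_memory_region(region)?;
    //
    // Note that `create_user_memory_region` strips KVM_MEM_LOG_DIRTY_PAGES and
    // only records the slot; logging actually begins in `start_dirty_log`.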
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create split irqchip
        // Only the local APIC is emulated in kernel, both PICs and IOAPIC
        // are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        use std::io::{Error, ErrorKind};
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = kvm_bindings::CpuId::from_entries(&cpuid).map_err(|_| {
            vm::HypervisorVmError::InitializeTdx(Error::new(
                ErrorKind::Other,
                "failed to allocate CpuId",
            ))
        })?;

        #[repr(C)]
        struct TdxInitVm {
            max_vcpus: u32,
            tsc_khz: u32,
            attributes: u64,
            cpuid: u64,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            reserved: [u64; 43],
        }
        let data = TdxInitVm {
            max_vcpus,
            tsc_khz: 0,
            attributes: 0,
            cpuid: kvm_cpuid.as_fam_struct_ptr() as u64,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            reserved: [0; 43],
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            if measure { 1 } else { 0 },
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    metadata: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        metadata: u32,
        data: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        metadata,
        data,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}
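
// A minimal sketch of how a caller might decode the bitmap returned by
// `get_dirty_log` above. The LSB-first, one-bit-per-4KiB-page layout is an
// assumption based on KVM's dirty-log ABI, not something this file defines.
#[cfg(test)]
mod dirty_bitmap_decode_tests {
    #[test]
    fn pages_0_and_66_are_reported_dirty() {
        let bitmap: Vec<u64> = vec![0b1, 0b100];
        let mut dirty = Vec::new();
        for (word_idx, word) in bitmap.iter().enumerate() {
            for bit in 0..64 {
                if (word >> bit) & 1 == 1 {
                    dirty.push(word_idx * 64 + bit);
                }
            }
        }
        assert_eq!(dirty, vec![0, 66]);
    }
}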

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on KVM
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
/// Implementation of the Hypervisor trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
///
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap(); // default VM type
    ///
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // If the error returned is EINTR, which means the
                        // ioctl has been interrupted, we have to retry as
                        // this can't be considered as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    /// Example
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// use hypervisor::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    ///
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
    fn get_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve the AArch64 host maximum IPA size supported by KVM.
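    /// (Hedged note: `create_vm` above feeds this value to KVM as the VM
    /// type when `Cap::ArmVmIPASize` is available; KVM is expected to report
    /// 0 when the capability is absent.)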
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of the Vcpu trait for KVM
/// Example:
/// #[cfg(feature = "kvm")]
/// extern crate hypervisor;
/// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
/// let hypervisor: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// e.g. vcpu.get_mp_state().unwrap();
///
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64; `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
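        // (Hedged note: `arm64_core_reg_id!` composes each register id used
        // below from KVM_REG_ARM64 | KVM_REG_ARM_CORE | a KVM_REG_SIZE_*
        // constant, plus the struct field offset expressed in 32-bit words,
        // matching how KVM names core registers for the ONE_REG API.)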
        let mut off = offset__of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These actually are the general-purpose registers of the Armv8-a
        // architecture (i.e. x0-x30 when used as 64-bit registers, or w0-w30
        // when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, the stack pointer.
        let off = offset__of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Second one, the program counter.
        let off = offset__of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Next is the processor state.
        let off = offset__of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // The stack pointer associated with EL1.
        let off = offset__of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Exception Link Register for EL1. When taking an exception to EL1,
        // this register holds the address to return to afterwards.
        let off = offset__of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;

        // Saved Program Status Registers; there are 5 of them used in the kernel.
        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to the floating-point registers, which are stored in
        // the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .into();
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;

        // Floating-point Control Register
        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            as u32;
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64; `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // This function mirrors the register order used by `get_regs`; see
        // there for additional details on the individual registers.
        let mut off = offset__of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset__of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.sp)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pc)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.regs.pstate)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.sp_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.elr_el1)
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset__of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off), state.spsr[i])
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i] as u64,
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset__of!(kvm_regs, fp_regs) + offset__of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr as u64,
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
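    /// (A short count from `KVM_SET_MSRS` identifies the first entry that
    /// could not be written; `set_state` below relies on this to skip the
    /// faulty MSR and retry with the remainder.)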
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
    fn get_mp_state(&self) -> cpu::Result<MpState> {
        Ok(self
            .fd
            .get_mp_state()
            .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
            .into())
    }
    ///
    /// Sets the vcpu's current "multiprocessing state".
    ///
    fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
        self.fd
            .set_mp_state(mp_state.into())
            .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
    ///
    fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
        let tr = self
            .fd
            .translate_gva(gva)
            .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
        // tr.valid is set if the GVA is mapped to a valid GPA.
        match tr.valid {
            0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
                "Invalid GVA: {:#x}",
                gva
            ))),
            _ => Ok((tr.physical_address, 0)),
        }
    }
    ///
    /// Triggers the running of the current virtual CPU, returning an exit reason.
    ///
    fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
        match self.fd.run() {
            Ok(run) => match run {
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoIn(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_read(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoIn(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoOut(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .pio_write(addr.into(), data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::IoOut(addr, data))
                }
                #[cfg(target_arch = "x86_64")]
                VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
                #[cfg(target_arch = "x86_64")]
                VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),

                #[cfg(target_arch = "aarch64")]
                VcpuExit::SystemEvent(event_type, flags) => {
                    use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
                    // On AArch64, when the VM is shut down, run() returns
                    // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN
                    if event_type == KVM_SYSTEM_EVENT_RESET {
                        Ok(cpu::VmExit::Reset)
                    } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
                        Ok(cpu::VmExit::Shutdown)
                    } else {
                        Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                            "Unexpected system event with type 0x{:x}, flags 0x{:x}",
                            event_type,
                            flags
                        )))
                    }
                }

                VcpuExit::MmioRead(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_read(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioRead(addr, data))
                }
                VcpuExit::MmioWrite(addr, data) => {
                    if let Some(vm_ops) = &self.vm_ops {
                        return vm_ops
                            .mmio_write(addr, data)
                            .map(|_| cpu::VmExit::Ignore)
                            .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
                    }

                    Ok(cpu::VmExit::MmioWrite(addr, data))
                }
                VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
                #[cfg(feature = "tdx")]
                VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
                VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),

                r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "Unexpected exit reason on vcpu run: {:?}",
                    r
                ))),
            },

            Err(ref e) => match e.errno() {
                libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
                _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
                    "VCPU error {:?}",
                    e
                ))),
            },
        }
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Let the guest know that it has been paused, which prevents potential
    /// soft lockups when it is resumed.
    ///
    fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
        if let Err(e) = self.fd.kvmclock_ctrl() {
            // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
            // which could be because we're still in firmware or the guest doesn't
            // use KVM clock.
            if e.errno() != libc::EINVAL {
                return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
            }
        }

        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets debug registers to set hardware breakpoints and/or enable single step.
    ///
    fn set_guest_debug(
        &self,
        addrs: &[vm_memory::GuestAddress],
        singlestep: bool,
    ) -> cpu::Result<()> {
        if addrs.len() > 4 {
            return Err(cpu::HypervisorCpuError::SetDebugRegs(anyhow!(
                "Support 4 breakpoints at most but {} addresses are passed",
                addrs.len()
            )));
        }

        let mut dbg = kvm_guest_debug {
            control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
            ..Default::default()
        };
        if singlestep {
            dbg.control |= KVM_GUESTDBG_SINGLESTEP;
        }

        // Set bits 9 and 10.
        // bit 9: GE (global exact breakpoint enable) flag.
        // bit 10: always 1.
        dbg.arch.debugreg[7] = 0x0600;

        for (i, addr) in addrs.iter().enumerate() {
            dbg.arch.debugreg[i] = addr.0;
            // Set global breakpoint enable flag
            dbg.arch.debugreg[7] |= 2 << (i * 2);
        }

        self.fd
            .set_guest_debug(&dbg)
            .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
    }
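
    // Worked example for `set_guest_debug` above (breakpoint addresses are
    // hypothetical): with two addresses, debugreg[7] becomes
    //     0x0600 | (2 << 0) | (2 << 2) == 0x060a,
    // i.e. GE, the always-one bit 10, and the global-enable bits for
    // hardware breakpoint slots 0 and 1.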
    #[cfg(target_arch = "aarch64")]
    fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
        self.fd
            .vcpu_init(kvi)
            .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
    }
    ///
    /// Sets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_reg(&self, reg_id: u64, data: u64) -> cpu::Result<()> {
        self.fd
            .set_one_reg(reg_id, data)
            .map_err(|e| cpu::HypervisorCpuError::SetRegister(e.into()))
    }
    ///
    /// Gets the value of one register for this vCPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg(&self, reg_id: u64) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(reg_id)
            .map_err(|e| cpu::HypervisorCpuError::GetRegister(e.into()))
    }
    ///
    /// Gets a list of the guest registers that are supported for the
    /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
        self.fd
            .get_reg_list(reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
    }
    ///
    /// Save the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_sys_regs(&self) -> cpu::Result<Vec<Register>> {
        // Call KVM_GET_REG_LIST to get all registers available to the guest.
        // For ArmV8 there are around 500 registers.
        let mut state: Vec<Register> = Vec::new();
        let mut reg_list = RegList::new(500).unwrap();
        self.fd
            .get_reg_list(&mut reg_list)
            .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;

        // At this point reg_list should contain the core registers and the
        // system registers.
        // The register list contains the number of registers and their ids. We
        // will need to call KVM_GET_ONE_REG on each id in order to save all of
        // them. We carve out from the list the core registers, which are
        // represented in the kernel by the kvm_regs structure and for which we
        // can calculate the id based on the offset in the structure.
        reg_list.retain(|regid| is_system_register(*regid));

        // Now, for the rest of the registers left in the previously fetched
        // register list, we simply call KVM_GET_ONE_REG.
        let indices = reg_list.as_slice();
        for index in indices.iter() {
            state.push(kvm_bindings::kvm_one_reg {
                id: *index,
                addr: self
                    .fd
                    .get_one_reg(*index)
                    .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?,
            });
        }

        Ok(state)
    }
    ///
    /// Restore the state of the system registers.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_sys_regs(&self, state: &[Register]) -> cpu::Result<()> {
        for reg in state {
            self.fd
                .set_one_reg(reg.id, reg.addr)
                .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
        }
        Ok(())
    }
    ///
    /// Read the MPIDR - Multiprocessor Affinity Register.
    ///
    #[cfg(target_arch = "aarch64")]
    fn read_mpidr(&self) -> cpu::Result<u64> {
        self.fd
            .get_one_reg(MPIDR_EL1)
            .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))
    }
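
    // Hedged summary of the AArch64 boot contract that `setup_regs` below
    // implements (per the kernel's Documentation/arm64/booting.txt):
    // PSTATE = 0x3c5 (EL1h with D, A, I and F masked), PC = the kernel
    // entry point, and x0 = the FDT address; secondary vCPUs are left
    // powered off for PSCI to wake.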
    ///
    /// Configure core registers for a given CPU.
    ///
    #[cfg(target_arch = "aarch64")]
    fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
        #[allow(non_upper_case_globals)]
        // PSR (Processor State Register) bits.
        // Taken from arch/arm64/include/uapi/asm/ptrace.h.
        const PSR_MODE_EL1h: u64 = 0x0000_0005;
        const PSR_F_BIT: u64 = 0x0000_0040;
        const PSR_I_BIT: u64 = 0x0000_0080;
        const PSR_A_BIT: u64 = 0x0000_0100;
        const PSR_D_BIT: u64 = 0x0000_0200;
        // Taken from arch/arm64/kvm/inject_fault.c.
        const PSTATE_FAULT_BITS_64: u64 =
            PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;

        let kreg_off = offset__of!(kvm_regs, regs);

        // Get the register index of the PSTATE (Processor State) register.
        let pstate = offset__of!(user_pt_regs, pstate) + kreg_off;
        self.set_reg(
            arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
            PSTATE_FAULT_BITS_64,
        )
        .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        // Other vCPUs are powered off initially, awaiting PSCI wakeup.
        if cpu_id == 0 {
            // Setting the PC (Program Counter) to the current program address
            // (kernel address).
            let pc = offset__of!(user_pt_regs, pc) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip as u64)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

            // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
            // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
            // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
            // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
            let regs0 = offset__of!(user_pt_regs, regs) + kreg_off;
            self.set_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0), fdt_start)
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Get the current CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
    /// vCPU/LAPIC state. As such, it must be done before most everything
    /// else, otherwise we cannot restore everything and expect it to work.
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// KVM_GET_LAPIC may change state of LAPIC before returning it.
    ///
    /// GET_VCPU_EVENTS should probably be last to save: it looks like it
    /// could also be affected by internal state changes from the other GET
    /// ioctls.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// GET_MSRS requires a pre-populated data structure to do something
    /// meaningful. For SET_MSRS it will then contain good data.
    ///
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// ```
    fn state(&self) -> cpu::Result<CpuState> {
        let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
        let mp_state = self.get_mp_state()?.into();
        let regs = self.get_regs()?;
        let sregs = self.get_sregs()?;
        let xsave = self.get_xsave()?;
        let xcrs = self.get_xcrs()?;
        let lapic_state = self.get_lapic()?;
        let fpu = self.get_fpu()?;

        // Try to get all MSRs based on the list previously retrieved from KVM.
        // If the number of MSRs obtained from GET_MSRS is different from the
        // expected amount, we fallback onto a slower method by getting MSRs
        // by chunks. This is the only way to make sure we try to get as many
        // MSRs as possible, even if some MSRs are not supported.
        let mut msr_entries = self.msrs.clone();

        // Save extra MSRs if the Hyper-V synthetic interrupt controller is
        // emulated.
        if self.hyperv_synic.load(Ordering::Acquire) {
            let hyperv_synic_msrs = vec![
                0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
                0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
                0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
                0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
                0x400000b5, 0x400000b6, 0x400000b7,
            ];
            for index in hyperv_synic_msrs {
                let msr = kvm_msr_entry {
                    index,
                    ..Default::default()
                };
                msr_entries.push(msr.into());
            }
        }

        let expected_num_msrs = msr_entries.len();
        let num_msrs = self.get_msrs(&mut msr_entries)?;
        let msrs = if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;
            let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while getting MSRs",
                    msr_entries[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
                let num_msrs = self.get_msrs(&mut sub_msr_entries)?;

                msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }

            msr_entries_tmp
        } else {
            msr_entries
        };

        let vcpu_events = self.get_vcpu_events()?;

        Ok(VcpuKvmState {
            cpuid,
            msrs,
            vcpu_events,
            regs: regs.into(),
            sregs: sregs.into(),
            fpu,
            lapic_state,
            xsave,
            xcrs,
            mp_state,
        }
        .into())
    }
    ///
    /// Get the current AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn state(&self) -> cpu::Result<CpuState> {
        let mut state = VcpuKvmState {
            mp_state: self.get_mp_state()?.into(),
            mpidr: self.read_mpidr()?,
            ..Default::default()
        };
        state.core_regs = self.get_regs()?;
        state.sys_regs = self.get_sys_regs()?;

        Ok(state.into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Restore the previously saved CPU state
    ///
    /// Ordering requirements:
    ///
    /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
    /// still running.
    ///
    /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
    /// if we ever change the BSP, we have to do that before restoring anything.
    /// The same seems to be true for CPUID stuff.
    ///
    /// SREGS saves/restores a pending interrupt, similar to what
    /// VCPU_EVENTS also does.
    ///
    /// SET_REGS clears pending exceptions unconditionally, thus, it must be
    /// done before SET_VCPU_EVENTS, which restores them.
    ///
    /// SET_LAPIC must come after SET_SREGS, because the latter restores
    /// the apic base msr.
    ///
    /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
    /// only restores successfully when the LAPIC is correctly configured.
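    ///
    /// If SET_MSRS rejects an entry (for example an MSR the running kernel
    /// does not support), the code below falls back to setting the remaining
    /// MSRs in chunks, mirroring the save path in state().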
    ///
    /// Arguments: CpuState
    /// # Example
    ///
    /// ```rust
    /// # extern crate hypervisor;
    /// # use hypervisor::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = hypervisor::kvm::KvmHypervisor::new().unwrap();
    /// let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let state = vcpu.state().unwrap();
    /// vcpu.set_state(&state).unwrap();
    /// ```
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_cpuid2(&state.cpuid)?;
        self.set_mp_state(state.mp_state.into())?;
        self.set_regs(&state.regs.into())?;
        self.set_sregs(&state.sregs.into())?;
        self.set_xsave(&state.xsave)?;
        self.set_xcrs(&state.xcrs)?;
        self.set_lapic(&state.lapic_state)?;
        self.set_fpu(&state.fpu)?;

        // Try to set all MSRs previously stored.
        // If the number of MSRs set by SET_MSRS differs from the expected
        // amount, we fall back on a slower method that sets MSRs in chunks.
        // This is the only way to make sure we try to set as many MSRs as
        // possible, even if some MSRs are not supported.
        let expected_num_msrs = state.msrs.len();
        let num_msrs = self.set_msrs(&state.msrs)?;
        if num_msrs != expected_num_msrs {
            let mut faulty_msr_index = num_msrs;

            loop {
                warn!(
                    "Detected faulty MSR 0x{:x} while setting MSRs",
                    state.msrs[faulty_msr_index].index
                );

                // Skip the first bad MSR
                let start_pos = faulty_msr_index + 1;

                let sub_msr_entries = state.msrs[start_pos..].to_vec();

                let num_msrs = self.set_msrs(&sub_msr_entries)?;

                if num_msrs == sub_msr_entries.len() {
                    break;
                }

                faulty_msr_index = start_pos + num_msrs;
            }
        }

        self.set_vcpu_events(&state.vcpu_events)?;

        Ok(())
    }
    ///
    /// Restore the previously saved AArch64 CPU state
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
        let state: VcpuKvmState = state.clone().into();
        self.set_regs(&state.core_regs)?;
        self.set_sys_regs(&state.sys_regs)?;
        self.set_mp_state(state.mp_state.into())?;

        Ok(())
    }

    ///
    /// Initialize TDX for this CPU
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
            .map_err(cpu::HypervisorCpuError::InitializeTdx)
    }

    ///
    /// Set the "immediate_exit" state
    ///
    fn set_immediate_exit(&self, exit: bool) {
        self.fd.set_kvm_immediate_exit(exit.into());
    }

    ///
    /// Returns the details about TDX exit reason
    ///
    #[cfg(feature = "tdx")]
    fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        // Pre-set the status so an unrecognized vmcall is reported as invalid.
        tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;

        if tdx_vmcall.type_ != 0 {
            return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
        }

        match tdx_vmcall.subfunction {
            TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
            TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
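                // Per the TDX GHCI spec, TDG.VP.VMCALL<SetupEventNotifyInterrupt>
                // carries the interrupt vector the guest wants event
                // notifications (e.g. quote completion) delivered on; the
                // caller of this function is responsible for recording it.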
                Ok(TdxExitDetails::SetupEventNotifyInterrupt)
            }
            _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
        }
    }

    ///
    /// Set the status code for TDX exit
    ///
    #[cfg(feature = "tdx")]
    fn set_tdx_status(&mut self, status: TdxExitStatus) {
        let kvm_run = self.fd.get_kvm_run();
        let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };

        tdx_vmcall.status_code = match status {
            TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
            TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
        };
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Return the list of initial MSR entries for a VCPU
    ///
    fn boot_msr_entries(&self) -> Vec<MsrEntry> {
        use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};

        [
            msr!(msr_index::MSR_IA32_SYSENTER_CS),
            msr!(msr_index::MSR_IA32_SYSENTER_ESP),
            msr!(msr_index::MSR_IA32_SYSENTER_EIP),
            msr!(msr_index::MSR_STAR),
            msr!(msr_index::MSR_CSTAR),
            msr!(msr_index::MSR_LSTAR),
            msr!(msr_index::MSR_KERNEL_GS_BASE),
            msr!(msr_index::MSR_SYSCALL_MASK),
            msr!(msr_index::MSR_IA32_TSC),
            msr_data!(
                msr_index::MSR_IA32_MISC_ENABLE,
                msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
            ),
            msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
        ]
        .to_vec()
    }
    #[cfg(target_arch = "aarch64")]
    fn has_pmu_support(&self) -> bool {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        self.fd.has_device_attr(&cpu_attr).is_ok()
    }
    #[cfg(target_arch = "aarch64")]
    fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };
        let cpu_attr_irq = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
            addr: &irq as *const u32 as u64,
            flags: 0,
        };
        // The PMU overflow interrupt must be set before the PMU is initialized.
        self.fd
            .set_device_attr(&cpu_attr_irq)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
        self.fd
            .set_device_attr(&cpu_attr)
            .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
    }
}

impl KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xsave struct".
    ///
    fn get_xsave(&self) -> cpu::Result<Xsave> {
        self.fd
            .get_xsave()
            .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xsave struct".
    ///
    fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
        self.fd
            .set_xsave(xsave)
            .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that returns the vcpu's current "xcrs".
    ///
    fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
        self.fd
            .get_xcrs()
            .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call that sets the vcpu's current "xcrs".
    ///
    fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
        self.fd
            .set_xcrs(xcrs)
            .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns currently pending exceptions, interrupts, and NMIs as well as related
    /// states of the vcpu.
    ///
    fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
        self.fd
            .get_vcpu_events()
            .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets pending exceptions, interrupts, and NMIs as well as related states
    /// of the vcpu.
    ///
    fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
        self.fd
            .set_vcpu_events(events)
            .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
    }
}
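
// A minimal sketch of the save/restore round trip exercised by state() and
// set_state() above, assuming a host with /dev/kvm available. The test module
// and test name are illustrative additions, not part of the upstream API.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod vcpu_state_tests {
    use super::*;

    #[test]
    fn vcpu_state_round_trip() {
        let kvm = KvmHypervisor::new().unwrap();
        let hv: Arc<dyn hypervisor::Hypervisor> = Arc::new(kvm);
        let vm = hv.create_vm().expect("new VM fd creation failed");
        vm.enable_split_irq().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        // Saving and restoring must follow the ordering requirements
        // documented on state() and set_state().
        let state = vcpu.state().unwrap();
        vcpu.set_state(&state).unwrap();
    }
}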