// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

static unsigned long __ro_after_init io_map_base;

#define KVM_PGT_FN(fn)	(!is_protected_kvm_enabled() ? fn : p ## fn)

static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}

static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We also have to make sure that the page
 * tables are not freed while the lock is released.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	int ret;
	u64 next;

	do {
		struct kvm_pgtable *pgt = mmu->pgt;
		if (!pgt)
			return -EINVAL;

		next = stage2_range_addr_end(addr, end);
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
			cond_resched_rwlock_write(&kvm->mmu_lock);
	} while (addr = next, addr != end);

	return ret;
}

#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)

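/*
 * Eager page splitting: when dirty logging is in use, huge mappings are
 * proactively broken down into PAGE_SIZE PTEs so that writes can be tracked
 * at page granularity. The helpers below size the page-table cache needed
 * for that work and drive it in chunks of
 * kvm->arch.mmu.split_page_chunk_size bytes.
 */
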
/*
 * Get the maximum number of page-table pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
	return 0;
}

int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				     gfn_t gfn, u64 nr_pages)
{
	u64 size = nr_pages << PAGE_SHIFT;
	u64 addr = gfn << PAGE_SHIFT;

	if (is_protected_kvm_enabled())
		kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
	else
		kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size);
	return 0;
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
	s8 level = page_private(page);

	KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level);
}

static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM. However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
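
/*
 * Note: when FWB is not available, the clean+invalidate described above is
 * performed by the stage-2 unmap walker itself, through the
 * dcache_clean_inval_poc() callback wired up in kvm_s2_mm_ops below.
 */
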
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	phys_addr_t end = start + size;

	lockdep_assert_held_write(&kvm->mmu_lock);
	WARN_ON(size & ~PAGE_MASK);
	WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap),
				   may_block));
}

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
			    u64 size, bool may_block)
{
	__unmap_stage2_range(mmu, start, size, may_block);
}

void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush));
}

static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_flush_memslot(kvm, memslot);

	kvm_nested_s2_flush(kvm);

	write_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 */
void __init free_hyp_pgds(void)
{
	mutex_lock(&kvm_hyp_pgd_mutex);
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
		hyp_pgtable = NULL;
	}
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

static bool kvm_host_owns_hyp_mappings(void)
{
	if (is_kernel_in_hyp_mode())
		return false;

	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
{
	int err;

	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;

	mutex_lock(&kvm_hyp_pgd_mutex);
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
	mutex_unlock(&kvm_hyp_pgd_mutex);

	return err;
}

static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

struct hyp_shared_pfn {
	u64 pfn;
	int count;
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

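/*
 * Make a kernel object visible to the hypervisor. When the host still owns
 * the hyp page tables this is a plain hyp mapping; under protected KVM each
 * pfn is instead shared via hypercall, with the hyp_shared_pfns rb-tree
 * ref-counting pfns that back more than one shared object.
 */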
int kvm_share_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;
	int ret;

	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
		if (ret)
			return ret;
	}

	return 0;
}

void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
 * @prot:	The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
	phys_addr_t phys_addr;
	unsigned long virt_addr;
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);

	if (is_kernel_in_hyp_mode())
		return 0;

	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;

	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);

	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;

		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
					    prot);
		if (err)
			return err;
	}

	return 0;
}

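/*
 * Private EL2 VA allocations (IO mappings, vectors, stacks) are carved out
 * top-down from io_map_base, which starts at the idmap page and is only ever
 * lowered; there is no way to give VA space back to the allocator.
 */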
static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick in. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = NVHE_STACK_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
	 */
	ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
				    phys_addr, PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	s8 level = S8_MAX;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
		return -EFAULT;
	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

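/*
 * Memory management callbacks used by the stage-2 page-table code: table
 * pages come from the memory cache supplied by the caller, unlinked tables
 * are freed via RCU, and the cache maintenance hooks provide the dcache
 * clean and icache invalidation needed when mapping guest memory.
 */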
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
	.free_unlinked_table	= stage2_free_unlinked_table,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_s2_put_page,
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
};

static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

	return 0;
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and the
	 * MMU context is the canonical one, we have a bug somewhere,
	 * as this is only supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick up a fuss and leave gracefully.
	 */
	if (mmu->pgt != NULL) {
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
	if (!pgt)
		return -ENOMEM;

	mmu->arch = &kvm->arch;
	err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
	if (err)
		goto out_free_pgtable;

	mmu->pgt = pgt;
	if (is_protected_kvm_enabled())
		return 0;

	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
		err = -ENOMEM;
		goto out_destroy_pgtable;
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

	/* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

	mmu->pgd_phys = __pa(pgt->pgd);

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

	return 0;

out_destroy_pgtable:
	KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
		struct vm_area_struct *vma;
		hva_t vm_start, vm_end;

		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true);
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Go through the memregions and unmap any regular RAM
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int idx, bkt;

	idx = srcu_read_lock(&kvm->srcu);
	mmap_read_lock(current->mm);
	write_lock(&kvm->mmu_lock);

	slots = kvm_memslots(kvm);
	kvm_for_each_memslot(memslot, bkt, slots)
		stage2_unmap_memslot(kvm, memslot);

	kvm_nested_s2_unmap(kvm, true);

	write_unlock(&kvm->mmu_lock);
	mmap_read_unlock(current->mm);
	srcu_read_unlock(&kvm->srcu, idx);
}

void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
	struct kvm_pgtable *pgt = NULL;

	write_lock(&kvm->mmu_lock);
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
	}
	write_unlock(&kvm->mmu_lock);

	if (pgt) {
		KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
		kfree(pgt);
	}
}

static void hyp_mc_free_fn(void *addr, void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;

	if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, -1);

	free_page((unsigned long)addr);
}

static void *hyp_mc_alloc_fn(void *mc)
{
	struct kvm_hyp_memcache *memcache = mc;
	void *addr;

	addr = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
	if (addr && memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
		kvm_account_pgtable_pages(addr, 1);

	return addr;
}

void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (!is_protected_kvm_enabled())
		return;

	kfree(mc->mapping);
	__free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	if (!mc->mapping) {
		mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
		if (!mc->mapping)
			return -ENOMEM;
	}

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, mc);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:	Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	struct kvm_pgtable *pgt = mmu->pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	if (is_protected_kvm_enabled())
		return -EPERM;

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(mmu));
		if (ret)
			break;

		write_lock(&kvm->mmu_lock);
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE,
							 pa, prot, &cache, 0);
		write_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

/**
 * kvm_stage2_wp_range() - write protect stage2 memory region range
 * @mmu:	The KVM stage-2 MMU pointer
 * @addr:	Start address of range
 * @end:	End address of range
 */
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
	kvm_nested_s2_wp(kvm);
	write_unlock(&kvm->mmu_lock);
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
}

/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *				   pages for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	phys_addr_t start, end;

	lockdep_assert_held(&kvm->slots_lock);

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, slot);

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_mmu_split_huge_pages(kvm, start, end);
	write_unlock(&kvm->mmu_lock);
}

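/*
 * Dirty logging therefore comes in two flavours: without initially-all-set,
 * the whole memslot is write-protected and eagerly split when logging is
 * enabled (see kvm_arch_commit_memory_region()); with initially-all-set,
 * both steps are deferred and done per-mask from the dirty-log clear path
 * via the function below.
 */
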
/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of pages at offset 'gfn_offset' in this memory
 *		slot to enable dirty logging on
 *
 * Write-protects selected pages to enable dirty logging, and then
 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);

	/*
	 * Eager-splitting is done when manual-protect is set.  We
	 * also check for initially-all-set because we can avoid
	 * eager-splitting if initially-all-set is false.
	 * Initially-all-set equal false implies that huge-pages were
	 * already split when enabling dirty logging: no need to do it
	 * again.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_split_huge_pages(kvm, start, end);

	kvm_nested_s2_wp(kvm);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
{
	gpa_t gpa_start;
	hva_t uaddr_start, uaddr_end;
	size_t size;

	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

	/* pKVM only supports PMD_SIZE huge-mappings */
	if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
		return false;

	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
	 *    +-----+--------------------+--------------------+---+
	 *
	 *    memslot->base_gfn << PAGE_SHIFT:
	 *      +---+--------------------+--------------------+-----+
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
	 *      +---+--------------------+--------------------+-----+
	 *
	 * If we create those stage-2 blocks, we'll end up with this incorrect
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
}

/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		int sz = get_user_mapping_size(kvm, hva);

		if (sz < 0)
			return sz;

		if (sz < PMD_SIZE)
			return PAGE_SIZE;

		*ipap &= PMD_MASK;
		pfn &= ~(PTRS_PER_PMD - 1);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

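/*
 * Work out the largest page shift usable for this VMA: hugetlb VMAs report
 * their hstate's shift, PFNMAP VMAs may still be block-mapped when the HVA
 * and PA are co-aligned and the VMA covers the whole PUD/PMD block, and
 * anything else falls back to PAGE_SHIFT.
 */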
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
	struct page *page = pfn_to_page(pfn);
	struct folio *folio = page_folio(page);

	if (!kvm_has_mte(kvm))
		return;

	if (folio_test_hugetlb(folio)) {
		/* Hugetlb has MTE flags set on head page only */
		if (folio_try_hugetlb_mte_tagging(folio)) {
			for (i = 0; i < nr_pages; i++, page++)
				mte_clear_page_tags(page_address(page));
			folio_set_hugetlb_mte_tagged(folio);
		}
		return;
	}

	for (i = 0; i < nr_pages; i++, page++) {
		if (try_page_mte_tagging(page)) {
			mte_clear_page_tags(page_address(page));
			set_page_mte_tagged(page);
		}
	}
}

static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
}

static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  struct kvm_s2_trans *nested,
			  struct kvm_memory_slot *memslot, unsigned long hva,
			  bool fault_is_perm)
{
	int ret = 0;
	bool write_fault, writable, force_pte = false;
	bool exec_fault, mte_allowed;
	bool device = false, vfio_allow_any_uc = false;
	unsigned long mmu_seq;
	phys_addr_t ipa = fault_ipa;
	struct kvm *kvm = vcpu->kvm;
	struct vm_area_struct *vma;
	short vma_shift;
	void *memcache;
	gfn_t gfn;
	kvm_pfn_t pfn;
	bool logging_active = memslot_is_logging(memslot);
	long vma_pagesize, fault_granule;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
	struct page *page;
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;

	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
	write_fault = kvm_is_write_fault(vcpu);
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
	VM_BUG_ON(write_fault && exec_fault);

	if (fault_is_perm && !write_fault && !exec_fault) {
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

	if (!is_protected_kvm_enabled())
		memcache = &vcpu->arch.mmu_page_cache;
	else
		memcache = &vcpu->arch.pkvm_memcache;

	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
	if (!fault_is_perm || (logging_active && write_fault)) {
		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

		if (!is_protected_kvm_enabled())
			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
		else
			ret = topup_hyp_memcache(memcache, min_pages);

		if (ret)
			return ret;
	}

	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
	mmap_read_lock(current->mm);
	vma = vma_lookup(current->mm, hva);
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
		mmap_read_unlock(current->mm);
		return -EFAULT;
	}

	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
		force_pte = true;
		vma_shift = PAGE_SHIFT;
	} else {
		vma_shift = get_vma_page_shift(vma, hva);
	}

	switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
#endif
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
		vma_shift = PAGE_SHIFT;
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
	}

	vma_pagesize = 1UL << vma_shift;

	if (nested) {
		unsigned long max_map_size;

		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

		ipa = kvm_s2_trans_output(nested);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min(vma_pagesize, (long)max_map_size);
	}

	/*
	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
	 * ensure we find the right PFN and lay down the mapping in the right
	 * place.
	 */
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
		fault_ipa &= ~(vma_pagesize - 1);
		ipa &= ~(vma_pagesize - 1);
	}

	gfn = ipa >> PAGE_SHIFT;
	mte_allowed = kvm_vma_mte_allowed(vma);

	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

	/* Don't use the VMA after the unlock -- it may have vanished */
	vma = NULL;

	/*
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __kvm_faultin_pfn() become stale prior to
	 * acquiring kvm->mmu_lock.
	 *
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
	 */
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);

	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
				&writable, &page);
	if (pfn == KVM_PFN_ERR_HWPOISON) {
		kvm_send_hwpoison_signal(hva, vma_shift);
		return 0;
	}
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	if (kvm_is_device_pfn(pfn)) {
		/*
		 * If the page was identified as device early by looking at
		 * the VMA flags, vma_pagesize is already representing the
		 * largest quantity we can map.
		 * If instead it was mapped via __kvm_faultin_pfn(), vma_pagesize
		 * is set to PAGE_SIZE and must not be upgraded.
		 *
		 * In both cases, we don't let transparent_hugepage_adjust()
		 * change things at the last minute.
		 */
		device = true;
	} else if (logging_active && !write_fault) {
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
		writable = false;
	}

	if (exec_fault && device)
		return -ENOEXEC;

	/*
	 * Potentially reduce shadow S2 permissions to match the guest's own
	 * S2. For exec faults, we'd only reach this point if the guest
	 * actually allowed it (see kvm_s2_handle_perm_fault).
	 *
	 * Also encode the level of the original translation in the SW bits
	 * of the leaf entry as a proxy for the span of that translation.
	 * This will be retrieved on TLB invalidation from the guest and
	 * used to limit the invalidation scope if a TTL hint or a range
	 * isn't provided.
	 */
	if (nested) {
		writable &= kvm_s2_trans_writable(nested);
		if (!kvm_s2_trans_readable(nested))
			prot &= ~KVM_PGTABLE_PROT_R;

		prot |= kvm_encode_nested_level(nested);
	}

	kvm_fault_lock(kvm);
	pgt = vcpu->arch.hw_mmu->pgt;
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
		goto out_unlock;
	}

	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
		if (fault_is_perm && fault_granule > PAGE_SIZE)
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
	}

	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
		/* Check the VMM hasn't introduced a new disallowed VMA */
		if (mte_allowed) {
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
			ret = -EFAULT;
			goto out_unlock;
		}
	}

	if (writable)
		prot |= KVM_PGTABLE_PROT_W;

	if (exec_fault)
		prot |= KVM_PGTABLE_PROT_X;

	if (device) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
		   (!nested || kvm_s2_trans_executable(nested))) {
		prot |= KVM_PGTABLE_PROT_X;
	}

	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
	} else {
		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
							 __pfn_to_phys(pfn), prot,
							 memcache, flags);
	}

out_unlock:
	kvm_release_faultin_page(kvm, page, !!ret, writable);
	kvm_fault_unlock(kvm);

	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret)
		mark_page_dirty_in_slot(kvm, memslot, gfn);

	return ret != -EAGAIN ? ret : 0;
}

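/*
 * Unlike translation or permission faults, an access-flag fault never changes
 * the output address or permissions of a mapping, so it can be fixed up under
 * the mmu_lock held for read and without allocating from a memcache.
 */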
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
	struct kvm_s2_mmu *mmu;

	trace_kvm_access_fault(fault_ipa);

	read_lock(&vcpu->kvm->mmu_lock);
	mmu = vcpu->arch.hw_mmu;
	KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags);
	read_unlock(&vcpu->kvm->mmu_lock);
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
	struct kvm_s2_trans nested_trans, *nested = NULL;
	unsigned long esr;
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
	struct kvm_memory_slot *memslot;
	unsigned long hva;
	bool is_iabt, write_fault, writable;
	gfn_t gfn;
	int ret, idx;

	/* Synchronous External Abort? */
	if (kvm_vcpu_abt_issea(vcpu)) {
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
		if (kvm_handle_guest_sea())
			kvm_inject_vabt(vcpu);

		return 1;
	}

	esr = kvm_vcpu_get_esr(vcpu);

	/*
	 * The fault IPA should be reliable at this point as we're not dealing
	 * with an SEA.
	 */
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
	if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
		return -EFAULT;

	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

	if (esr_fsc_is_translation_fault(esr)) {
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

			if (is_iabt)
				kvm_inject_pabt(vcpu, fault_ipa);
			else
				kvm_inject_dabt(vcpu, fault_ipa);
			return 1;
		}
	}

	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);

	/* Check the stage-2 fault is trans. fault or write fault */
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
			(unsigned long)kvm_vcpu_get_esr(vcpu));
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest.
In this case, we have to resolve the L2 1866 * IPA to the L1 IPA first, before knowing what kind of memory should 1867 * back the L1 IPA. 1868 * 1869 * If the shadow stage 2 page table walk faults, then we simply inject 1870 * this to the guest and carry on. 1871 * 1872 * If there are no shadow S2 PTs because S2 is disabled, there is 1873 * nothing to walk and we treat it as a 1:1 before going through the 1874 * canonical translation. 1875 */ 1876 if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) && 1877 vcpu->arch.hw_mmu->nested_stage2_enabled) { 1878 u32 esr; 1879 1880 ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans); 1881 if (ret) { 1882 esr = kvm_s2_trans_esr(&nested_trans); 1883 kvm_inject_s2_fault(vcpu, esr); 1884 goto out_unlock; 1885 } 1886 1887 ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans); 1888 if (ret) { 1889 esr = kvm_s2_trans_esr(&nested_trans); 1890 kvm_inject_s2_fault(vcpu, esr); 1891 goto out_unlock; 1892 } 1893 1894 ipa = kvm_s2_trans_output(&nested_trans); 1895 nested = &nested_trans; 1896 } 1897 1898 gfn = ipa >> PAGE_SHIFT; 1899 memslot = gfn_to_memslot(vcpu->kvm, gfn); 1900 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable); 1901 write_fault = kvm_is_write_fault(vcpu); 1902 if (kvm_is_error_hva(hva) || (write_fault && !writable)) { 1903 /* 1904 * The guest has put either its instructions or its page-tables 1905 * somewhere it shouldn't have. Userspace won't be able to do 1906 * anything about this (there's no syndrome for a start), so 1907 * re-inject the abort back into the guest. 1908 */ 1909 if (is_iabt) { 1910 ret = -ENOEXEC; 1911 goto out; 1912 } 1913 1914 if (kvm_vcpu_abt_iss1tw(vcpu)) { 1915 kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); 1916 ret = 1; 1917 goto out_unlock; 1918 } 1919 1920 /* 1921 * Check for a cache maintenance operation. Since we 1922 * ended-up here, we know it is outside of any memory 1923 * slot. But we can't find out if that is for a device, 1924 * or if the guest is just being stupid. The only thing 1925 * we know for sure is that this range cannot be cached. 1926 * 1927 * So let's assume that the guest is just being 1928 * cautious, and skip the instruction. 1929 */ 1930 if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) { 1931 kvm_incr_pc(vcpu); 1932 ret = 1; 1933 goto out_unlock; 1934 } 1935 1936 /* 1937 * The IPA is reported as [MAX:12], so we need to 1938 * complement it with the bottom 12 bits from the 1939 * faulting VA. This is always 12 bits, irrespective 1940 * of the page size. 
                 */
                ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
                ret = io_mem_abort(vcpu, ipa);
                goto out_unlock;
        }

        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

        if (esr_fsc_is_access_flag_fault(esr)) {
                handle_access_fault(vcpu, fault_ipa);
                ret = 1;
                goto out_unlock;
        }

        ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
                             esr_fsc_is_permission_fault(esr));
        if (ret == 0)
                ret = 1;
out:
        if (ret == -ENOEXEC) {
                kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                ret = 1;
        }
out_unlock:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        if (!kvm->arch.mmu.pgt)
                return false;

        __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
                             (range->end - range->start) << PAGE_SHIFT,
                             range->may_block);

        kvm_nested_s2_unmap(kvm, range->may_block);
        return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, true);
        /*
         * TODO: Handle nested_mmu structures here using the reverse mapping in
         * a later version of the patch series.
         */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                               range->start << PAGE_SHIFT,
                                                               size, false);
}

phys_addr_t kvm_mmu_get_httbr(void)
{
        return __pa(hyp_pgtable->pgd);
}

phys_addr_t kvm_get_idmap_vector(void)
{
        return hyp_idmap_vector;
}

static int kvm_map_idmap_text(void)
{
        unsigned long size = hyp_idmap_end - hyp_idmap_start;
        int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
                                        PAGE_HYP_EXEC);
        if (err)
                kvm_err("Failed to idmap %lx-%lx\n",
                        hyp_idmap_start, hyp_idmap_end);

        return err;
}

static void *kvm_hyp_zalloc_page(void *arg)
{
        return (void *)get_zeroed_page(GFP_KERNEL);
}

static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
        .zalloc_page    = kvm_hyp_zalloc_page,
        .get_page       = kvm_host_get_page,
        .put_page       = kvm_host_put_page,
        .phys_to_virt   = kvm_host_va,
        .virt_to_phys   = kvm_host_pa,
};

int __init kvm_mmu_init(u32 *hyp_va_bits)
{
        int err;
        u32 idmap_bits;
        u32 kernel_bits;

        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
        hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
        hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
        hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

        /*
         * We rely on the linker script to ensure at build time that the HYP
         * init code does not cross a page boundary.
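         *
         * The BUG_ON() below relies on this: when hyp_idmap_start and
         * (hyp_idmap_end - 1) fall within the same page, they can only
         * differ in bits [PAGE_SHIFT - 1:0], so XORing them and masking
         * with PAGE_MASK must yield zero.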
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

        /*
         * The ID map is always configured for 48 bits of translation, which
         * may be fewer than the number of VA bits used by the regular kernel
         * stage 1, when VA_BITS=52.
         *
         * At EL2, there is only one TTBR register, and we can't switch between
         * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
         * line: we need to use the extended range with *both* our translation
         * tables.
         *
         * So use the maximum of the idmap VA bits and the regular kernel stage
         * 1 VA bits to ensure that the hypervisor can both ID map its code page
         * and map any kernel memory.
         */
        idmap_bits = IDMAP_VA_BITS;
        kernel_bits = vabits_actual;
        *hyp_va_bits = max(idmap_bits, kernel_bits);

        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
                  kern_hyp_va((unsigned long)high_memory - 1));

        if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
            hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
            hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
                /*
                 * The idmap page is intersecting with the VA space;
                 * it is not safe to continue further.
                 */
                kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
                err = -EINVAL;
                goto out;
        }

        hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
        if (!hyp_pgtable) {
                kvm_err("Hyp mode page-table not allocated\n");
                err = -ENOMEM;
                goto out;
        }

        err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;

        err = kvm_map_idmap_text();
        if (err)
                goto out_destroy_pgtable;

        io_map_base = hyp_idmap_start;
        __hyp_va_bits = *hyp_va_bits;
        return 0;

out_destroy_pgtable:
        kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
        kfree(hyp_pgtable);
        hyp_pgtable = NULL;
out:
        return err;
}

void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

        /*
         * At this point the memslot has been committed and there is an
         * allocated dirty_bitmap[]; dirty pages will be tracked while the
         * memory slot is write-protected.
         */
        if (log_dirty_pages) {

                if (change == KVM_MR_DELETE)
                        return;

                /*
                 * Huge and normal pages are write-protected and split
                 * in either of these two cases:
                 *
                 * 1. with initial-all-set: gradually with CLEAR ioctls,
                 */
                if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                        return;
                /*
                 * or
                 * 2. without initial-all-set: all in one shot when
                 *    enabling dirty logging.
                 */
                kvm_mmu_wp_memory_region(kvm, new->id);
                kvm_mmu_split_memory_region(kvm, new->id);
        } else {
                /*
                 * Free any leftovers from the eager page splitting cache. Do
                 * this when deleting, moving, disabling dirty logging, or
                 * creating the memslot (a nop). Doing it for deletes makes
                 * sure we don't leak memory, and there's no need to keep the
                 * cache around for any of the other cases.
                 */
                kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
        }
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   const struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        hva_t hva, reg_end;
        int ret = 0;

        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
            change != KVM_MR_FLAGS_ONLY)
                return 0;

        /*
         * Prevent userspace from creating a memory region outside of the IPA
         * space addressable by the guest.
         */
        if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;

        hva = new->userspace_addr;
        reg_end = hva + (new->npages << PAGE_SHIFT);

        mmap_read_lock(current->mm);
        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them.
         *
         *     +--------------------------------------------+
         * +---------------+----------------+   +----------------+
         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
         * +---------------+----------------+   +----------------+
         *     |               memory region                |
         *     +--------------------------------------------+
         */
        do {
                struct vm_area_struct *vma;

                vma = find_vma_intersection(current->mm, hva, reg_end);
                if (!vma)
                        break;

                if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }

                if (vma->vm_flags & VM_PFNMAP) {
                        /* IO region dirty page logging not allowed */
                        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                                ret = -EINVAL;
                                break;
                        }
                }
                hva = min(reg_end, vma->vm_end);
        } while (hva < reg_end);

        mmap_read_unlock(current->mm);
        return ret;
}

void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
{
        gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
        phys_addr_t size = slot->npages << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
        kvm_nested_s2_unmap(kvm, true);
        write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches both when the caches are turned on and when
 *   they are turned off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
        unsigned long hcr = *vcpu_hcr(vcpu);

        /*
         * If this is the first time we do a S/W operation
         * (i.e. HCR_TVM not set), flush the whole of memory and enable
         * VM trapping.
         *
         * Otherwise, rely on the VM trapping to wait for the MMU +
         * caches to be turned off. At that point, we'll be able to
         * clean the caches again.
         */
        if (!(hcr & HCR_TVM)) {
                trace_kvm_set_way_flush(*vcpu_pc(vcpu),
                                        vcpu_has_cache_enabled(vcpu));
                stage2_flush_vm(vcpu->kvm);
                *vcpu_hcr(vcpu) = hcr | HCR_TVM;
        }
}

void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
        bool now_enabled = vcpu_has_cache_enabled(vcpu);

        /*
         * If switching the MMU+caches on, we need to invalidate the caches.
         * If switching them off, we need to clean the caches.
         * Clean + invalidate does the trick always.
         */
        if (now_enabled != was_enabled)
                stage2_flush_vm(vcpu->kvm);

        /* Caches are now on, stop trapping VM ops (until a S/W op) */
        if (now_enabled)
                *vcpu_hcr(vcpu) &= ~HCR_TVM;

        trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}