// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/mmu.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/cache.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kexec.h>
#include <linux/libfdt.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>

#include <asm/barrier.h>
#include <asm/cputype.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/kfence.h>

#define NO_BLOCK_MAPPINGS	BIT(0)
#define NO_CONT_MAPPINGS	BIT(1)
#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */

enum pgtable_type {
	TABLE_PTE,
	TABLE_PMD,
	TABLE_PUD,
	TABLE_P4D,
};

u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

static bool rodata_is_rw __ro_after_init = true;

/*
 * The booting CPU updates the failed status @__early_cpu_boot_status,
 * with MMU turned off.
 */
long __section(".mmuoff.data.write") __early_cpu_boot_status;

/*
 * Empty_zero_page is a special page that is used for zero-initialized data
 * and COW.
 */
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);

static DEFINE_SPINLOCK(swapper_pgdir_lock);
static DEFINE_MUTEX(fixmap_lock);

void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *fixmap_pgdp;

	/*
	 * Don't bother with the fixmap if swapper_pg_dir is still mapped
	 * writable in the kernel mapping.
	 */
	if (rodata_is_rw) {
		WRITE_ONCE(*pgdp, pgd);
		dsb(ishst);
		isb();
		return;
	}

	spin_lock(&swapper_pgdir_lock);
	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*fixmap_pgdp, pgd);
	/*
	 * We need dsb(ishst) here to ensure the page-table-walker sees
	 * our new entry before set_p?d() returns. The fixmap's
	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
	 */
	pgd_clear_fixmap();
	spin_unlock(&swapper_pgdir_lock);
}

pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!pfn_is_map_memory(pfn))
		return pgprot_noncached(vma_prot);
	else if (file->f_flags & O_SYNC)
		return pgprot_writecombine(vma_prot);
	return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);

static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
{
	phys_addr_t phys;

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

	return phys;
}

bool pgattr_change_is_safe(pteval_t old, pteval_t new)
{
	/*
	 * The following mapping attributes may be updated in live
	 * kernel mappings without the need for break-before-make.
	 */
	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG |
			PTE_SWBITS_MASK;

	/* creating or taking down mappings is always safe */
	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

	/* A live entry's pfn should not change */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

	/* live contiguous mappings may not be manipulated at all */
	if ((old | new) & PTE_CONT)
		return false;

	/* Transitioning from Non-Global to Global is unsafe */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Changing the memory type between Normal and Normal-Tagged is safe
	 * since Tagged is considered a permission attribute from the
	 * mismatched attribute aliases perspective.
	 */
	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		mask |= PTE_ATTRINDX_MASK;

	return ((old ^ new) & ~mask) == 0;
}

static void init_clear_pgtable(void *table)
{
	clear_page(table);

	/* Ensure the zeroing is observed by page table walks. */
	dsb(ishst);
}

static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	do {
		pte_t old_pte = __ptep_get(ptep);

		/*
		 * Required barriers to make this visible to the table walker
		 * are deferred to the end of alloc_init_cont_pte().
		 */
		__set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/*
		 * After the PTE entry has been populated once, we
		 * only allow updates to the permission attributes.
		 */
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
					      pte_val(__ptep_get(ptep))));

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);
}

static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
				int flags)
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);
	pte_t *ptep;

	BUG_ON(pmd_sect(pmd));
	if (pmd_none(pmd)) {
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
		phys_addr_t pte_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(TABLE_PTE);
		ptep = pte_set_fixmap(pte_phys);
		init_clear_pgtable(ptep);
		ptep += pte_index(addr);
		__pmd_populate(pmdp, pte_phys, pmdval);
	} else {
		BUG_ON(pmd_bad(pmd));
		ptep = pte_set_fixmap_offset(pmdp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pte_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(ptep, addr, next, phys, __prot);

		ptep += pte_index(next) - pte_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	/*
	 * Note: barriers and maintenance necessary to clear the fixmap slot
	 * ensure that all previous pgtable writes are visible to the table
	 * walker.
	 */
	pte_clear_fixmap();
}

static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot,
		     phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
{
	unsigned long next;

	do {
		pmd_t old_pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);

		/* try section mapping first */
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pmd_set_huge(pmdp, phys, prot);

			/*
			 * After the PMD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
						      READ_ONCE(pmd_val(*pmdp))));
		} else {
			alloc_init_cont_pte(pmdp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			BUG_ON(pmd_val(old_pmd) != 0 &&
			       pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);
}

static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
				unsigned long end, phys_addr_t phys,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(enum pgtable_type),
				int flags)
{
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);
	pmd_t *pmdp;

	/*
	 * Check for initial section mappings in the pgd/pud.
	 */
	BUG_ON(pud_sect(pud));
	if (pud_none(pud)) {
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
		phys_addr_t pmd_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(TABLE_PMD);
		pmdp = pmd_set_fixmap(pmd_phys);
		init_clear_pgtable(pmdp);
		pmdp += pmd_index(addr);
		__pud_populate(pudp, pmd_phys, pudval);
	} else {
		BUG_ON(pud_bad(pud));
		pmdp = pmd_set_fixmap_offset(pudp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pmd_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0)
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);

		pmdp += pmd_index(next) - pmd_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	pmd_clear_fixmap();
}

static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
			   int flags)
{
	unsigned long next;
	p4d_t p4d = READ_ONCE(*p4dp);
	pud_t *pudp;

	if (p4d_none(p4d)) {
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
		phys_addr_t pud_phys;

		if (flags & NO_EXEC_MAPPINGS)
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(TABLE_PUD);
		pudp = pud_set_fixmap(pud_phys);
		init_clear_pgtable(pudp);
		pudp += pud_index(addr);
		__p4d_populate(p4dp, pud_phys, p4dval);
	} else {
		BUG_ON(p4d_bad(p4d));
		pudp = pud_set_fixmap_offset(p4dp, addr);
	}

	do {
		pud_t old_pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);

		/*
		 * For 4K granule only, attempt to put down a 1GB block
		 */
		if (pud_sect_supported() &&
		    ((addr | next | phys) & ~PUD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0) {
			pud_set_huge(pudp, phys, prot);

			/*
			 * After the PUD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
						      READ_ONCE(pud_val(*pudp))));
		} else {
			alloc_init_cont_pmd(pudp, addr, next, phys, prot,
					    pgtable_alloc, flags);

			BUG_ON(pud_val(old_pud) != 0 &&
			       pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
		}
		phys += next - addr;
	} while (pudp++, addr = next, addr != end);

	pud_clear_fixmap();
}

static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
			   phys_addr_t phys, pgprot_t prot,
			   phys_addr_t (*pgtable_alloc)(enum pgtable_type),
			   int flags)
{
	unsigned long next;
	pgd_t pgd = READ_ONCE(*pgdp);
	p4d_t *p4dp;

	if (pgd_none(pgd)) {
		pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
		phys_addr_t p4d_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pgdval |= PGD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		p4d_phys = pgtable_alloc(TABLE_P4D);
		p4dp = p4d_set_fixmap(p4d_phys);
		init_clear_pgtable(p4dp);
		p4dp += p4d_index(addr);
		__pgd_populate(pgdp, p4d_phys, pgdval);
	} else {
		BUG_ON(pgd_bad(pgd));
		p4dp = p4d_set_fixmap_offset(pgdp, addr);
	}

	do {
		p4d_t old_p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);

		alloc_init_pud(p4dp, addr, next, phys, prot,
			       pgtable_alloc, flags);

		BUG_ON(p4d_val(old_p4d) != 0 &&
		       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));

		phys += next - addr;
	} while (p4dp++, addr = next, addr != end);

	p4d_clear_fixmap();
}

static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
					unsigned long virt, phys_addr_t size,
					pgprot_t prot,
					phys_addr_t (*pgtable_alloc)(enum pgtable_type),
					int flags)
{
	unsigned long addr, end, next;
	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);

	/*
	 * If the virtual and physical address don't have the same offset
	 * within a page, we cannot map the region as the caller expects.
	 */
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
		return;

	phys &= PAGE_MASK;
	addr = virt & PAGE_MASK;
	end = PAGE_ALIGN(virt + size);

	do {
		next = pgd_addr_end(addr, end);
		alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
			       flags);
		phys += next - addr;
	} while (pgdp++, addr = next, addr != end);
}

static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				 unsigned long virt, phys_addr_t size,
				 pgprot_t prot,
				 phys_addr_t (*pgtable_alloc)(enum pgtable_type),
				 int flags)
{
	mutex_lock(&fixmap_lock);
	__create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
				    pgtable_alloc, flags);
	mutex_unlock(&fixmap_lock);
}

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
			     phys_addr_t size, pgprot_t prot,
			     phys_addr_t (*pgtable_alloc)(enum pgtable_type),
			     int flags);
#endif

static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
				       enum pgtable_type pgtable_type)
{
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
	phys_addr_t pa;

	BUG_ON(!ptdesc);
	pa = page_to_phys(ptdesc_page(ptdesc));

	switch (pgtable_type) {
	case TABLE_PTE:
		BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
		break;
	case TABLE_PMD:
		BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
		break;
	case TABLE_PUD:
		pagetable_pud_ctor(ptdesc);
		break;
	case TABLE_P4D:
		pagetable_p4d_ctor(ptdesc);
		break;
	}

	return pa;
}

static phys_addr_t __maybe_unused
pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
	return __pgd_pgtable_alloc(&init_mm, pgtable_type);
}

static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
{
	return __pgd_pgtable_alloc(NULL, pgtable_type);
}

/*
 * This function can only be used to modify existing table entries,
 * without allocating new levels of table. Note that this permits the
 * creation of new section or page entries.
 */
void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
				   phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}
	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);
}

void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
			       unsigned long virt, phys_addr_t size,
			       pgprot_t prot, bool page_mappings_only)
{
	int flags = 0;

	BUG_ON(mm == &init_mm);

	if (page_mappings_only)
		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
			     pgd_pgtable_alloc_special_mm, flags);
}

static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
				phys_addr_t size, pgprot_t prot)
{
	if (virt < PAGE_OFFSET) {
		pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
			&phys, virt);
		return;
	}

	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
			     NO_CONT_MAPPINGS);

	/* flush the TLBs after updating live kernel mappings */
	flush_tlb_kernel_range(virt, virt + size);
}

static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
				  phys_addr_t end, pgprot_t prot, int flags)
{
	__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
			     prot, early_pgtable_alloc, flags);
}

void __init mark_linear_text_alias_ro(void)
{
	/*
	 * Remove the write permissions from the linear alias of .text/.rodata
	 */
	update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
			    (unsigned long)__init_begin - (unsigned long)_stext,
			    PAGE_KERNEL_RO);
}

#ifdef CONFIG_KFENCE

bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;

/* early_param() will be parsed before map_mem() below. */
static int __init parse_kfence_early_init(char *arg)
{
	int val;

	if (get_option(&arg, &val))
		kfence_early_init = !!val;
	return 0;
}
early_param("kfence.sample_interval", parse_kfence_early_init);

static phys_addr_t __init arm64_kfence_alloc_pool(void)
{
	phys_addr_t kfence_pool;

	if (!kfence_early_init)
		return 0;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool) {
		pr_err("failed to allocate kfence pool\n");
		kfence_early_init = false;
		return 0;
	}

	/* Temporarily mark as NOMAP. */
	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);

	return kfence_pool;
}

static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
{
	if (!kfence_pool)
		return;

	/* KFENCE pool needs page-level mapping. */
	__map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
		       pgprot_tagged(PAGE_KERNEL),
		       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = phys_to_virt(kfence_pool);
}
#else /* CONFIG_KFENCE */

static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }

#endif /* CONFIG_KFENCE */

static void __init map_mem(pgd_t *pgdp)
{
	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
	phys_addr_t kernel_start = __pa_symbol(_stext);
	phys_addr_t kernel_end = __pa_symbol(__init_begin);
	phys_addr_t start, end;
	phys_addr_t early_kfence_pool;
	int flags = NO_EXEC_MAPPINGS;
	u64 i;

	/*
	 * Setting hierarchical PXNTable attributes on table entries covering
	 * the linear region is only possible if it is guaranteed that no table
	 * entries at any level are being shared between the linear region and
	 * the vmalloc region. Check whether this is true for the PGD level, in
	 * which case it is guaranteed to be true for all other levels as well.
	 * (Unless we are running with support for LPA2, in which case the
	 * entire reduced VA space is covered by a single pgd_t which will have
	 * been populated without the PXNTable attribute by the time we get here.)
	 */
	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);

	early_kfence_pool = arm64_kfence_alloc_pool();

	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/*
	 * Take care not to create a writable alias for the
	 * read-only text and rodata sections of the kernel image.
	 * So temporarily mark them as NOMAP to skip mappings in
	 * the following for-loop
	 */
	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

	/* map all the memory banks */
	for_each_mem_range(i, &start, &end) {
		if (start >= end)
			break;
		/*
		 * The linear map must allow allocation tags reading/writing
		 * if MTE is present. Otherwise, it has the same attributes as
		 * PAGE_KERNEL.
		 */
		__map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
			       flags);
	}

	/*
	 * Map the linear alias of the [_stext, __init_begin) interval
	 * as non-executable now, and remove the write permission in
	 * mark_linear_text_alias_ro() below (which will be called after
	 * alternative patching has completed). This makes the contents
	 * of the region accessible to subsystems such as hibernate,
	 * but protects it from inadvertent modification or execution.
	 * Note that contiguous mappings cannot be remapped in this way,
	 * so we should avoid them here.
	 */
	__map_memblock(pgdp, kernel_start, kernel_end,
		       PAGE_KERNEL, NO_CONT_MAPPINGS);
	memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
	arm64_kfence_map_pool(early_kfence_pool, pgdp);
}

void mark_rodata_ro(void)
{
	unsigned long section_size;

	/*
	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
	 * to cover NOTES and EXCEPTION_TABLE.
	 */
	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
	WRITE_ONCE(rodata_is_rw, false);
	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
			    section_size, PAGE_KERNEL_RO);
}

static void __init declare_vma(struct vm_struct *vma,
			       void *va_start, void *va_end,
			       unsigned long vm_flags)
{
	phys_addr_t pa_start = __pa_symbol(va_start);
	unsigned long size = va_end - va_start;

	BUG_ON(!PAGE_ALIGNED(pa_start));
	BUG_ON(!PAGE_ALIGNED(size));

	if (!(vm_flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	vma->addr	= va_start;
	vma->phys_addr	= pa_start;
	vma->size	= size;
	vma->flags	= VM_MAP | vm_flags;
	vma->caller	= __builtin_return_address(0);

	vm_area_add_early(vma);
}

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
static pgprot_t kernel_exec_prot(void)
{
	return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
}

static int __init map_entry_trampoline(void)
{
	int i;

	if (!arm64_kernel_unmapped_at_el0())
		return 0;

	pgprot_t prot = kernel_exec_prot();
	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);

	/* The trampoline is always mapped and can therefore be global */
	pgprot_val(prot) &= ~PTE_NG;

	/* Map only the text into the trampoline page table */
	memset(tramp_pg_dir, 0, PGD_SIZE);
	__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
			     entry_tramp_text_size(), prot,
			     pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);

	/* Map both the text and data into the kernel page table */
	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, prot);

	if (IS_ENABLED(CONFIG_RELOCATABLE))
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);

	return 0;
}
core_initcall(map_entry_trampoline);
#endif

/*
 * Declare the VMA areas for the kernel
 */
static void __init declare_kernel_vmas(void)
{
	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];

	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[4], _data, _end, 0);
}

void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
		    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);

static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
	  kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;

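/*
 * Map the [__idmap_text_start, __idmap_text_end) range 1:1 (VA == PA) in
 * idmap_pg_dir, using the statically allocated idmap_ptes[] pages for the
 * intermediate table levels, so the MMU can be toggled from code running at
 * its physical address. When the KPTI G-to-nG rewrite may run later, a
 * writable ID mapping of its synchronization flag is installed as well.
 */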
static void __init create_idmap(void)
{
	u64 start = __pa_symbol(__idmap_text_start);
	u64 end   = __pa_symbol(__idmap_text_end);
	u64 ptep  = __pa_symbol(idmap_ptes);

	__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
		       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
		       __phys_to_virt(ptep) - ptep);

	if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
		extern u32 __idmap_kpti_flag;
		u64 pa = __pa_symbol(&__idmap_kpti_flag);

		/*
		 * The KPTI G-to-nG conversion code needs a read-write mapping
		 * of its synchronization flag in the ID map.
		 */
		ptep = __pa_symbol(kpti_ptes);
		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
			       __phys_to_virt(ptep) - ptep);
	}
}

void __init paging_init(void)
{
	map_mem(swapper_pg_dir);

	memblock_allow_resize();

	create_idmap();
	declare_kernel_vmas();
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_hotplug_page_range(struct page *page, size_t size,
				    struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
	} else {
		WARN_ON(PageReserved(page));
		free_pages((unsigned long)page_address(page), get_order(size));
	}
}

static void free_hotplug_pgtable_page(struct page *page)
{
	free_hotplug_page_range(page, PAGE_SIZE, NULL);
}

static bool pgtable_range_aligned(unsigned long start, unsigned long end,
				  unsigned long floor, unsigned long ceiling,
				  unsigned long mask)
{
	start &= mask;
	if (start < floor)
		return false;

	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return false;
	}

	if (end - 1 > ceiling - 1)
		return false;
	return true;
}

static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	pte_t *ptep, pte;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);
		if (pte_none(pte))
			continue;

		WARN_ON(!pte_present(pte));
		__pte_clear(&init_mm, addr, ptep);
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
		if (free_mapped)
			free_hotplug_page_range(pte_page(pte),
						PAGE_SIZE, altmap);
	} while (addr += PAGE_SIZE, addr < end);
}

static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pmd_t *pmdp, pmd;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		WARN_ON(!pmd_present(pmd));
		if (pmd_sect(pmd)) {
			pmd_clear(pmdp);

			/*
			 * One TLBI should be sufficient here as the PMD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pmd_page(pmd),
							PMD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pmd_table(pmd));
		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pud_t *pudp, pud;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		WARN_ON(!pud_present(pud));
		if (pud_sect(pud)) {
			pud_clear(pudp);

			/*
			 * One TLBI should be sufficient here as the PUD_SIZE
			 * range is mapped with a single block entry.
			 */
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			if (free_mapped)
				free_hotplug_page_range(pud_page(pud),
							PUD_SIZE, altmap);
			continue;
		}
		WARN_ON(!pud_table(pud));
		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	p4d_t *p4dp, p4d;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

static void unmap_hotplug_range(unsigned long addr, unsigned long end,
				bool free_mapped, struct vmem_altmap *altmap)
{
	unsigned long next;
	pgd_t *pgdp, pgd;

	/*
	 * altmap can only be used as vmemmap mapping backing memory.
	 * In case the backing memory itself is not being freed, then
	 * altmap is irrelevant. Warn about this inconsistency when
	 * encountered.
	 */
	WARN_ON(!free_mapped && altmap);

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pte_t *ptep, pte;
	unsigned long i, start = addr;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);

		/*
		 * This is just a sanity check here which verifies that
		 * pte clearing has been done by earlier unmap loops.
		 */
		WARN_ON(!pte_none(pte));
	} while (addr += PAGE_SIZE, addr < end);

	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
		return;

	/*
	 * Check whether we can free the pte page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
	 */
	ptep = pte_offset_kernel(pmdp, 0UL);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(__ptep_get(&ptep[i])))
			return;
	}

	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(ptep));
}

static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pmd_t *pmdp, pmd;
	unsigned long i, next, start = addr;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (CONFIG_PGTABLE_LEVELS <= 2)
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
		return;

	/*
	 * Check whether we can free the pmd page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
	 */
	pmdp = pmd_offset(pudp, 0UL);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(READ_ONCE(pmdp[i])))
			return;
	}

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pmdp));
}

static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pud_t *pudp, pud;
	unsigned long i, next, start = addr;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (!pgtable_l4_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
		return;

	/*
	 * Check whether we can free the pud page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
	 */
	pudp = pud_offset(p4dp, 0UL);
	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (!pud_none(READ_ONCE(pudp[i])))
			return;
	}

	p4d_clear(p4dp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pudp));
}

static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	p4d_t *p4dp, p4d;
	unsigned long i, next, start = addr;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (!pgtable_l5_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
		return;

	/*
	 * Check whether we can free the p4d page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
	 */
	p4dp = p4d_offset(pgdp, 0UL);
	for (i = 0; i < PTRS_PER_P4D; i++) {
		if (!p4d_none(READ_ONCE(p4dp[i])))
			return;
	}

	pgd_clear(pgdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(p4dp));
}

static void free_empty_tables(unsigned long addr, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	unsigned long next;
	pgd_t *pgdp, pgd;

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);
}
#endif

void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
}

int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	vmemmap_verify((pte_t *)pmdp, node, addr, next);

	return pmd_sect(READ_ONCE(*pmdp));
}

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
	/* [start, end] should be within one section */
	WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page));

	if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES) ||
	    (end - start < PAGES_PER_SECTION * sizeof(struct page)))
		return vmemmap_populate_basepages(start, end, node, altmap);
	else
		return vmemmap_populate_hugepages(start, end, node, altmap);
}

#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));

	unmap_hotplug_range(start, end, true, altmap);
	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
				   pud_val(new_pud)))
		return 0;

	VM_BUG_ON(phys & ~PUD_MASK);
	set_pud(pudp, new_pud);
	return 1;
}

int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
				   pmd_val(new_pmd)))
		return 0;

	VM_BUG_ON(phys & ~PMD_MASK);
	set_pmd(pmdp, new_pmd);
	return 1;
}

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_huge(p4d_t *p4dp)
{
}
#endif

int pud_clear_huge(pud_t *pudp)
{
	if (!pud_sect(READ_ONCE(*pudp)))
		return 0;
	pud_clear(pudp);
	return 1;
}

int pmd_clear_huge(pmd_t *pmdp)
{
	if (!pmd_sect(READ_ONCE(*pmdp)))
		return 0;
	pmd_clear(pmdp);
	return 1;
}

int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	pte_t *table;
	pmd_t pmd;

	pmd = READ_ONCE(*pmdp);

	if (!pmd_table(pmd)) {
		VM_WARN_ON(1);
		return 1;
	}

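	/*
	 * Detach the PTE table from the PMD entry, flush the stale
	 * walk-cache entry for this address, then free the now
	 * unreachable PTE page.
	 */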
	table = pte_offset_kernel(pmdp, addr);
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(addr);
	pte_free_kernel(NULL, table);
	return 1;
}

int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	pmd_t *table;
	pmd_t *pmdp;
	pud_t pud;
	unsigned long next, end;

	pud = READ_ONCE(*pudp);

	if (!pud_table(pud)) {
		VM_WARN_ON(1);
		return 1;
	}

	table = pmd_offset(pudp, addr);
	pmdp = table;
	next = addr;
	end = addr + PUD_SIZE;
	do {
		if (pmd_present(pmdp_get(pmdp)))
			pmd_free_pte_page(pmdp, next);
	} while (pmdp++, next += PMD_SIZE, next != end);

	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	pmd_free(NULL, table);
	return 1;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
	unsigned long end = start + size;

	WARN_ON(pgdir != init_mm.pgd);
	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));

	unmap_hotplug_range(start, end, false, NULL);
	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;
	u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
	u64 end_linear_pa = __pa(PAGE_END - 1);

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		/*
		 * Check for a wrap: because of the randomized linear
		 * mapping, the start physical address may actually be
		 * bigger than the end physical address. In this case set
		 * start to zero because the [0, end_linear_pa] range must
		 * still be able to cover all addressable physical addresses.
		 */
		if (start_linear_pa > end_linear_pa)
			start_linear_pa = 0;
	}

	WARN_ON(start_linear_pa > end_linear_pa);

	/*
	 * The linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)],
	 * accommodating both of its ends but excluding PAGE_END. The maximum
	 * physical range which can be mapped inside this linear mapping range
	 * must also be derived from its end points.
	 */
	mhp_range.start = start_linear_pa;
	mhp_range.end =  end_linear_pa;

	return mhp_range;
}

int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	int ret, flags = NO_EXEC_MAPPINGS;

	VM_BUG_ON(!mhp_range_allowed(start, size, true));

	if (can_set_direct_map())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, params->pgprot, pgd_pgtable_alloc_init_mm,
			     flags);

	memblock_clear_nomap(start, size);

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			  params);
	if (ret)
		__remove_pgd_mapping(swapper_pg_dir,
				     __phys_to_virt(start), size);
	else {
		/* Address of hotplugged memory can be smaller */
		max_pfn = max(max_pfn, PFN_UP(start + size));
		max_low_pfn = max_pfn;
	}

	return ret;
}

void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	__remove_pages(start_pfn, nr_pages, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}

/*
 * This memory hotplug notifier helps prevent boot memory from being
 * inadvertently removed, as it blocks the pfn range offlining process in
 * __offline_pages(). Hence this prevents both offlining and removal of
 * boot memory, which is initially always online. In future, if and when
 * boot memory can be removed, this notifier should be dropped and
 * free_hotplug_page_range() should handle any reserved pages allocated
 * during boot.
 */
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
					   unsigned long action, void *data)
{
	struct mem_section *ms;
	struct memory_notify *arg = data;
	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
	unsigned long pfn = arg->start_pfn;

	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
		return NOTIFY_OK;

	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long start = PFN_PHYS(pfn);
		unsigned long end = start + (1UL << PA_SECTION_SHIFT);

		ms = __pfn_to_section(pfn);
		if (!early_section(ms))
			continue;

		if (action == MEM_GOING_OFFLINE) {
			/*
			 * Boot memory removal is not supported. Prevent
			 * it by blocking any attempted offline request
			 * for the boot memory and just report it.
			 */
			pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
			return NOTIFY_BAD;
		} else if (action == MEM_OFFLINE) {
			/*
			 * This should never have happened. Boot memory
			 * offlining should have been prevented by this
			 * very notifier. Probably some memory removal
			 * procedure has changed, which would then
			 * require further debugging.
			 */
			pr_err("Boot memory [%lx %lx] offlined\n", start, end);

			/*
			 * Core memory hotplug does not process a return
			 * code from the notifier for MEM_OFFLINE events.
			 * The error condition has been reported. Return
			 * from here as if ignored.
			 */
			return NOTIFY_DONE;
		}
	}
	return NOTIFY_OK;
}

static struct notifier_block prevent_bootmem_remove_nb = {
	.notifier_call = prevent_bootmem_remove_notifier,
};

/*
 * This ensures that boot memory sections on the platform are online
 * from early boot. Offlining (and hence removal) of a memory section
 * cannot be prevented if, for some reason, the section is not online
 * to begin with. This helps validate the basic assumption on which the
 * above memory event notifier works to prevent boot memory section
 * offlining and its possible removal.
 */
static void validate_bootmem_online(void)
{
	phys_addr_t start, end, addr;
	struct mem_section *ms;
	u64 i;

	/*
	 * Scanning across all memblock regions might be expensive
	 * on some big memory systems. Hence enable this
	 * validation only with DEBUG_VM.
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	for_each_mem_range(i, &start, &end) {
		for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) {
			ms = __pfn_to_section(PHYS_PFN(addr));

			/*
			 * All memory ranges in the system at this point
			 * should have been marked as early sections.
			 */
			WARN_ON(!early_section(ms));

			/*
			 * The memory notifier mechanism here to prevent boot
			 * memory offlining depends on the fact that each
			 * early memory section on the system is initially
			 * online. Otherwise a given memory section which
			 * is already offline will be overlooked and can
			 * be removed completely. Call out such sections.
			 */
			if (!online_section(ms))
				pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
					addr, addr + (1UL << PA_SECTION_SHIFT));
		}
	}
}

static int __init prevent_bootmem_remove_init(void)
{
	int ret = 0;

	if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
		return ret;

	validate_bootmem_online();
	ret = register_memory_notifier(&prevent_bootmem_remove_nb);
	if (ret)
		pr_err("%s: Notifier registration failed %d\n", __func__, ret);

	return ret;
}
early_initcall(prevent_bootmem_remove_init);
#endif

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
		/*
		 * Break-before-make (BBM) is required for all user space
		 * mappings when the permission changes from executable to
		 * non-executable in cases where the CPU is affected by
		 * erratum #2645198.
		 */
		if (pte_user_exec(ptep_get(ptep)))
			return ptep_clear_flush(vma, addr, ptep);
	}
	return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
			     pte_t old_pte, pte_t pte)
{
	set_pte_at(vma->vm_mm, addr, ptep, pte);
}

/*
 * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
 * avoiding the possibility of conflicting TLB entries being allocated.
 */
void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
{
	typedef void (ttbr_replace_func)(phys_addr_t);
	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
	ttbr_replace_func *replace_phys;
	unsigned long daif;

	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));

	if (cnp)
		ttbr1 |= TTBR_CNP_BIT;

	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);

	cpu_install_idmap();

	/*
	 * We really don't want to take *any* exceptions while TTBR1 is
	 * in the process of being replaced so mask everything.
	 */
	daif = local_daif_save();
	replace_phys(ttbr1);
	local_daif_restore(daif);

	cpu_uninstall_idmap();
}

#ifdef CONFIG_ARCH_HAS_PKEYS
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
{
	u64 new_por;
	u64 old_por;

	if (!system_supports_poe())
		return -ENOSPC;

	/*
	 * This code should only be called with valid 'pkey'
	 * values originating from in-kernel users. Complain
	 * if a bad value is observed.
	 */
	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
		return -EINVAL;

	/* Set the bits we need in POR:  */
	new_por = POE_RWX;
	if (init_val & PKEY_DISABLE_WRITE)
		new_por &= ~POE_W;
	if (init_val & PKEY_DISABLE_ACCESS)
		new_por &= ~POE_RW;
	if (init_val & PKEY_DISABLE_READ)
		new_por &= ~POE_R;
	if (init_val & PKEY_DISABLE_EXECUTE)
		new_por &= ~POE_X;

	/* Shift the bits in to the correct place in POR for pkey: */
	new_por = POR_ELx_PERM_PREP(pkey, new_por);

	/* Get old POR and mask off any old bits in place: */
	old_por = read_sysreg_s(SYS_POR_EL0);
	old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));

	/* Write old part along with new part: */
	write_sysreg_s(old_por | new_por, SYS_POR_EL0);

	return 0;
}
#endif