// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2007, 2011
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/cpufeature.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/machine.h>

pgprot_t pgprot_writecombine(pgprot_t prot)
{
	/*
	 * mio_wb_bit_mask may be set on a different CPU, but it is only set
	 * once at init and only read afterwards.
	 */
	return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL);
	}
}
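
/*
 * Descriptive note (sketch derived from the gmap code): with the TLB-guest
 * facility, IPTE accepts extra options. The ptep_ipte_*() helpers pick them
 * from the gmap state in mm->context.gmap_asce: 0 means no guest ASCE is
 * registered (IPTE_NODAT is safe), -1UL means more than one gmap is attached
 * so no single guest ASCE can be supplied, and any other value is the guest
 * ASCE itself, passed via IPTE_GUEST_ASCE so the guest's TLB entries are
 * covered by the flush as well. The local variant is only used when the mm
 * is attached to just the current CPU.
 */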
static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep, int nodat)
{
	unsigned long opt, asce;

	if (machine_has_tlb_guest()) {
		opt = 0;
		asce = READ_ONCE(mm->context.gmap_asce);
		if (asce == 0UL || nodat)
			opt |= IPTE_NODAT;
		if (asce != -1UL) {
			asce = asce ? : mm->context.asce;
			opt |= IPTE_GUEST_ASCE;
		}
		__ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL);
	} else {
		__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	}
}

static inline pte_t ptep_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep,
				      int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		ptep_ipte_local(mm, addr, ptep, nodat);
	else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pte_t *ptep,
				    int nodat)
{
	pte_t old;

	old = *ptep;
	if (unlikely(pte_val(old) & _PAGE_INVALID))
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
		mm->context.flush_mm = 1;
	} else
		ptep_ipte_global(mm, addr, ptep, nodat);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pgste_t pgste_get_lock(pte_t *ptep)
{
	unsigned long value = 0;
#ifdef CONFIG_PGSTE
	unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);

	do {
		value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
	} while (value & PGSTE_PCL_BIT);
	value |= PGSTE_PCL_BIT;
#endif
	return __pgste(value);
}

static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	barrier();
	WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}

static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;
#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}

static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}

static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48);	/* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}
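
/*
 * Storage key <-> PGSTE mapping used throughout this file (descriptive
 * sketch derived from the shifts above and below): the storage key byte
 * holds ACC in the high nibble (_PAGE_ACC_BITS), then the fetch protection
 * bit (_PAGE_FP_BIT), the referenced bit (_PAGE_REFERENCED) and the changed
 * bit (_PAGE_CHANGED). ACC/FP are mirrored in the PGSTE shifted left by 56
 * (PGSTE_ACC_BITS/PGSTE_FP_BIT), while guest R/C are tracked shifted left
 * by 48 (PGSTE_GR_BIT/PGSTE_GC_BIT). pgste_update_all() harvests the real
 * key into the PGSTE; pgste_set_key() below restores a real key from the
 * PGSTE when a pte becomes valid again.
 */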
static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}

static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}

static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}

static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}

static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm);
		if (pte_val(new) & _PAGE_INVALID) {
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}

pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_direct(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);
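
/*
 * ptep_xchg_direct() above flushes the TLB immediately; ptep_xchg_lazy()
 * further down only marks the pte invalid and sets mm->context.flush_mm
 * when the mm is attached to no other CPU, deferring the actual flush
 * until the mm is flushed lazily (see __tlb_flush_mm_lazy()). Both share
 * the same start/commit sequence: take the PGSTE lock and deliver pending
 * notification bits, flush or invalidate, then propagate storage key and
 * dirty state and unlock.
 */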
/*
 * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that
 * RDP can be used instead of IPTE. See also comments at pte_allow_rdp().
 */
void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
			 pte_t new)
{
	preempt_disable();
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		__ptep_rdp(addr, ptep, 0, 0, 1);
	else
		__ptep_rdp(addr, ptep, 0, 0, 0);
	/*
	 * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That
	 * means it is still valid and active, and must not be changed according
	 * to the architecture. But writing a new value that only differs in SW
	 * bits is allowed.
	 */
	set_pte(ptep, new);
	atomic_dec(&mm->context.flush_count);
	preempt_enable();
}
EXPORT_SYMBOL(ptep_reset_dat_prot);

pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;
	int nodat;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);

pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep)
{
	pgste_t pgste;
	pte_t old;
	int nodat;
	struct mm_struct *mm = vma->vm_mm;

	preempt_disable();
	pgste = ptep_xchg_start(mm, addr, ptep);
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	old = ptep_flush_lazy(mm, addr, ptep, nodat);
	if (mm_has_pgste(mm)) {
		pgste = pgste_update_all(old, pgste, mm);
		pgste_set(ptep, pgste);
	}
	return old;
}

void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
			     pte_t *ptep, pte_t old_pte, pte_t pte)
{
	pgste_t pgste;
	struct mm_struct *mm = vma->vm_mm;

	if (mm_has_pgste(mm)) {
		pgste = pgste_get(ptep);
		pgste_set_key(ptep, pgste, pte, mm);
		pgste = pgste_set_pte(ptep, pgste, pte);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, pte);
	}
	preempt_enable();
}

static inline void pmdp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest())
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
	if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
		gmap_pmdp_idte_local(mm, addr);
}

static inline void pmdp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	if (machine_has_tlb_guest()) {
		__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else if (cpu_has_idte()) {
		__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_idte_global(mm, addr);
	} else {
		__pmdp_csp(pmdp);
		if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
			gmap_pmdp_csp(mm, addr);
	}
}
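
/*
 * Same direct/lazy split as for ptes, only at segment (pmd) level: the
 * direct variant issues IDTE (or CSP as a fallback) right away, while the
 * lazy variant below may just mark the entry invalid and set
 * mm->context.flush_mm when this CPU is the only one with the mm attached.
 * When the mm has PGSTEs, the corresponding gmap pmds are invalidated or
 * flushed as well (see the gmap_pmdp_* calls).
 */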
static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pmdp_idte_local(mm, addr, pmdp);
	else
		pmdp_idte_global(mm, addr, pmdp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old;

	old = *pmdp;
	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpumask_equal(&mm->context.cpu_attach_mask,
			  cpumask_of(smp_processor_id()))) {
		set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
		mm->context.flush_mm = 1;
		if (mm_has_pgste(mm))
			gmap_pmdp_invalidate(mm, addr);
	} else {
		pmdp_idte_global(mm, addr, pmdp);
	}
	atomic_dec(&mm->context.flush_count);
	return old;
}

#ifdef CONFIG_PGSTE
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;

	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;

	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;

	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;

	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif

pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_direct(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);

pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
		     pmd_t *pmdp, pmd_t new)
{
	pmd_t old;

	preempt_disable();
	old = pmdp_flush_lazy(mm, addr, pmdp);
	set_pmd(pmdp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);

static inline void pudp_idte_local(struct mm_struct *mm,
				   unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_LOCAL);
	else
		__pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL);
}

static inline void pudp_idte_global(struct mm_struct *mm,
				    unsigned long addr, pud_t *pudp)
{
	if (machine_has_tlb_guest())
		__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
			    mm->context.asce, IDTE_GLOBAL);
	else if (cpu_has_idte())
		__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
	else
		/*
		 * Invalid bit position is the same for pmd and pud, so we can
		 * reuse _pmd_csp() here
		 */
		__pmdp_csp((pmd_t *) pudp);
}

static inline pud_t pudp_flush_direct(struct mm_struct *mm,
				      unsigned long addr, pud_t *pudp)
{
	pud_t old;

	old = *pudp;
	if (pud_val(old) & _REGION_ENTRY_INVALID)
		return old;
	atomic_inc(&mm->context.flush_count);
	if (cpu_has_tlb_lc() &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
		pudp_idte_local(mm, addr, pudp);
	else
		pudp_idte_global(mm, addr, pudp);
	atomic_dec(&mm->context.flush_count);
	return old;
}

pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
		       pud_t *pudp, pud_t new)
{
	pud_t old;

	preempt_disable();
	old = pudp_flush_direct(mm, addr, pudp);
	set_pud(pudp, new);
	preempt_enable();
	return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);
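
/*
 * The THP deposit/withdraw code below keeps preallocated page tables in a
 * list that is threaded through the deposited page table pages themselves:
 * the struct list_head overlays the first two (invalid) pte slots of each
 * deposited table, which is why pgtable_trans_huge_withdraw() re-initializes
 * those two entries to _PAGE_INVALID before handing the table back.
 */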
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	set_pte(ptep, __pte(_PAGE_INVALID));
	ptep++;
	set_pte(ptep, __pte(_PAGE_INVALID));
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_PGSTE
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
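
/*
 * Invalidation notification: ptep_set_notify() above arms PGSTE_IN_BIT for
 * a pte, and ptep_force_prot() below sets whatever notification bit the
 * caller passes in (typically PGSTE_IN_BIT or PGSTE_VSIE_BIT). When such a
 * pte is later changed through one of the ptep_xchg paths,
 * pgste_pte_notify() clears the bits again and calls ptep_notify() so that
 * gmap users and VSIE shadow tables can react.
 */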
/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}

int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0;	/* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}

void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}

static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct folio *folio = pfn_swap_entry_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}

void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
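
/*
 * ptep_zap_unused() above drops the swap backing of pages the guest has
 * declared unused or logically zero via ESSA (see pgste_perform_essa()
 * further down). ptep_zap_key() below resets the real storage key to
 * PAGE_DEFAULT_KEY while marking reference and change as set in the PGSTE,
 * so the guest's logical view of R/C is not lost.
 */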
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}

/*
 * Test and reset if a guest page is dirty
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
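
/*
 * The storage key accessors below walk the page tables themselves, so the
 * caller is expected to hold the mmap lock. A hypothetical caller sketch
 * (names are illustrative only):
 *
 *	mmap_read_lock(mm);
 *	rc = set_guest_storage_key(mm, vmaddr, key, nq);
 *	mmap_read_unlock(mm);
 *
 * A nonzero key for an address without a page table is an error (-EFAULT),
 * while setting key 0 there is a no-op, since the key already is 0.
 */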
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations; they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
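
/*
 * For rrbe the condition code roughly encodes the old reference and change
 * state: bit 1 of the cc reflects R, bit 0 reflects C (cc 0..3). The
 * function below merges the hardware cc from page_reset_referenced() with
 * the software GR/GC bits from the PGSTE, so the guest sees its own logical
 * view of the key rather than the host's physical one.
 */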
/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);

	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);

int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;

	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again;
	pgste = pgste_get_lock(ptep);
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
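
/*
 * The ESSA handling below implements the CMMA page usage states: a page can
 * be marked stable, unused, volatile or potentially volatile in its PGSTE.
 * A return value of 1 means the page is to be added to the CBRL and its
 * PGSTE is marked logically zero, i.e. the guest no longer relies on the
 * page content and the host may discard the backing page (see
 * ptep_zap_unused() above).
 */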
/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 *	   or < 0 in case of error. -EINVAL is returned for invalid values
 *	   of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);
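
/*
 * The two PGSTE accessors below use vma_lookup() and get_locked_pte(), so
 * the caller must hold the mmap lock of @mm. A hypothetical usage sketch
 * (error handling trimmed, names illustrative only):
 *
 *	mmap_read_lock(mm);
 *	rc = get_pgste(mm, hva, &pgstev);
 *	if (!rc)
 *		rc = set_pgste_bits(mm, hva, PGSTE_GR_BIT, PGSTE_GR_BIT);
 *	mmap_read_unlock(mm);
 */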
/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);

	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);

/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif