1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright 2002 Andi Kleen, SuSE Labs.
4 * Thanks to Ben LaHaise for precious feedback.
5 */
6 #include <linux/highmem.h>
7 #include <linux/memblock.h>
8 #include <linux/sched.h>
9 #include <linux/mm.h>
10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h>
12 #include <linux/proc_fs.h>
13 #include <linux/debugfs.h>
14 #include <linux/pfn.h>
15 #include <linux/percpu.h>
16 #include <linux/gfp.h>
17 #include <linux/pci.h>
18 #include <linux/vmalloc.h>
19 #include <linux/libnvdimm.h>
20 #include <linux/vmstat.h>
21 #include <linux/kernel.h>
22 #include <linux/cc_platform.h>
23 #include <linux/set_memory.h>
24 #include <linux/memregion.h>
25
26 #include <asm/e820/api.h>
27 #include <asm/processor.h>
28 #include <asm/tlbflush.h>
29 #include <asm/sections.h>
30 #include <asm/setup.h>
31 #include <linux/uaccess.h>
32 #include <asm/pgalloc.h>
33 #include <asm/proto.h>
34 #include <asm/memtype.h>
35
36 #include "../mm_internal.h"
37
38 /*
39 * The current flushing context - we pass it instead of 5 arguments:
40 */
struct cpa_data {
	unsigned long	*vaddr;		/* single address, or array when CPA_ARRAY */
	pgd_t		*pgd;		/* alternate PGD to operate on; NULL means init_mm */
	pgprot_t	mask_set;	/* protection bits to set */
	pgprot_t	mask_clr;	/* protection bits to clear */
	unsigned long	numpages;	/* number of 4K pages left to process */
	unsigned long	curpage;	/* index of the page currently processed */
	unsigned long	pfn;		/* pfn of the current address */
	unsigned int	flags;		/* CPA_* flags defined below */
	unsigned int	force_split : 1,	/* always split large pages */
			force_static_prot : 1,	/* enforce static protections on split */
			force_flush_all : 1;	/* flush the whole TLB, not just a range */
	struct page	**pages;	/* page array for CPA_PAGES_ARRAY mode */
};
55
/* Categories for static-protection conflict reporting, in rising verbosity. */
enum cpa_warn {
	CPA_CONFLICT,	/* existing mapping already violates static protections */
	CPA_PROTECT,	/* a request was overridden to honor static protections */
	CPA_DETECT,	/* detection-only scan */
};

/* Warn up to and including CPA_PROTECT; stay silent for CPA_DETECT scans. */
static const int cpa_warn_level = CPA_PROTECT;
63
64 /*
65 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
66 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
67 * entries change the page attribute in parallel to some other cpu
68 * splitting a large page entry along with changing the attribute.
69 */
70 static DEFINE_SPINLOCK(cpa_lock);
71
72 #define CPA_FLUSHTLB 1
73 #define CPA_ARRAY 2
74 #define CPA_PAGES_ARRAY 4
75 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
76 #define CPA_COLLAPSE 16 /* try to collapse large pages */
77
/* Convert a page cache mode to the corresponding pgprot_t attribute bits. */
static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
{
	return __pgprot(cachemode2protval(pcm));
}
82
83 #ifdef CONFIG_PROC_FS
84 static unsigned long direct_pages_count[PG_LEVEL_NUM];
85
/*
 * Account @pages direct-map pages added (negative: removed) at @level
 * for the /proc/meminfo DirectMap* statistics.
 */
void update_page_count(int level, unsigned long pages)
{
	/* Protect against CPA */
	spin_lock(&pgd_lock);
	direct_pages_count[level] += pages;
	spin_unlock(&pgd_lock);
}
93
/*
 * One large page at @level was split: move its accounting down to
 * PTRS_PER_PTE entries at the next smaller level, and count the split
 * in vmstat once the system is fully up.
 */
static void split_page_count(int level)
{
	if (direct_pages_count[level] == 0)
		return;

	direct_pages_count[level]--;
	if (system_state == SYSTEM_RUNNING) {
		if (level == PG_LEVEL_2M)
			count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
		else if (level == PG_LEVEL_1G)
			count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
	}
	direct_pages_count[level - 1] += PTRS_PER_PTE;
}
108
/*
 * Inverse of split_page_count(): PTRS_PER_PTE entries at the smaller
 * level were merged back into one large page at @level.
 */
static void collapse_page_count(int level)
{
	direct_pages_count[level]++;
	if (system_state == SYSTEM_RUNNING) {
		if (level == PG_LEVEL_2M)
			count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
		else if (level == PG_LEVEL_1G)
			count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
	}
	direct_pages_count[level - 1] -= PTRS_PER_PTE;
}
120
/*
 * Emit the DirectMap* lines of /proc/meminfo.  The shifts convert a
 * page count to kB: <<2 for 4K pages, <<11 for 2M, <<12 for 4M (32-bit
 * non-PAE) and <<20 for 1G pages.
 */
void arch_report_meminfo(struct seq_file *m)
{
	seq_printf(m, "DirectMap4k: %8lu kB\n",
			direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
	seq_printf(m, "DirectMap2M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 11);
#else
	seq_printf(m, "DirectMap4M: %8lu kB\n",
			direct_pages_count[PG_LEVEL_2M] << 12);
#endif
	if (direct_gbpages)
		seq_printf(m, "DirectMap1G: %8lu kB\n",
			direct_pages_count[PG_LEVEL_1G] << 20);
}
136 #else
/* !CONFIG_PROC_FS: direct-map accounting compiles away. */
static inline void split_page_count(int level) { }
static inline void collapse_page_count(int level) { }
139 #endif
140
141 #ifdef CONFIG_X86_CPA_STATISTICS
142
143 static unsigned long cpa_1g_checked;
144 static unsigned long cpa_1g_sameprot;
145 static unsigned long cpa_1g_preserved;
146 static unsigned long cpa_2m_checked;
147 static unsigned long cpa_2m_sameprot;
148 static unsigned long cpa_2m_preserved;
149 static unsigned long cpa_4k_install;
150
/* Count a 1G page examined for preservation. */
static inline void cpa_inc_1g_checked(void)
{
	cpa_1g_checked++;
}
155
/* Count a 2M page examined for preservation. */
static inline void cpa_inc_2m_checked(void)
{
	cpa_2m_checked++;
}
160
/* Count a 4K PTE installed; data_race(): stat only, lost updates are fine. */
static inline void cpa_inc_4k_install(void)
{
	data_race(cpa_4k_install++);
}
165
/* Count a large page kept because the requested pgprot was unchanged. */
static inline void cpa_inc_lp_sameprot(int level)
{
	if (level == PG_LEVEL_1G)
		cpa_1g_sameprot++;
	else
		cpa_2m_sameprot++;
}
173
/* Count a large page preserved despite a pgprot change. */
static inline void cpa_inc_lp_preserved(int level)
{
	if (level == PG_LEVEL_1G)
		cpa_1g_preserved++;
	else
		cpa_2m_preserved++;
}
181
/* seq_file show callback: dump all CPA statistics counters. */
static int cpastats_show(struct seq_file *m, void *p)
{
	seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
	seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
	seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
	seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
	seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
	seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
	return 0;
}
193
/* debugfs open callback: hook up the single-record seq_file. */
static int cpastats_open(struct inode *inode, struct file *file)
{
	return single_open(file, cpastats_show, NULL);
}
198
/* File operations for /sys/kernel/debug/x86/cpa_stats. */
static const struct file_operations cpastats_fops = {
	.open		= cpastats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
205
/* Create the root-readable cpa_stats debugfs file at late boot. */
static int __init cpa_stats_init(void)
{
	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
			    &cpastats_fops);
	return 0;
}
late_initcall(cpa_stats_init);
213 #else
/* !CONFIG_X86_CPA_STATISTICS: all counters compile away. */
static inline void cpa_inc_1g_checked(void) { }
static inline void cpa_inc_2m_checked(void) { }
static inline void cpa_inc_4k_install(void) { }
static inline void cpa_inc_lp_sameprot(int level) { }
static inline void cpa_inc_lp_preserved(int level) { }
219 #endif
220
221
/*
 * Return true when @addr lies in the half-open interval [@start, @end).
 */
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
	if (addr < start)
		return 0;

	return addr < end;
}
227
228 #ifdef CONFIG_X86_64
229
/*
 * Return true when @addr lies in the closed interval [@start, @end].
 */
static inline int
within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
{
	if (addr < start)
		return 0;

	return addr <= end;
}
235
236 /*
237 * The kernel image is mapped into two places in the virtual address space
238 * (addresses without KASLR, of course):
239 *
240 * 1. The kernel direct map (0xffff880000000000)
241 * 2. The "high kernel map" (0xffffffff81000000)
242 *
243 * We actually execute out of #2. If we get the address of a kernel symbol, it
244 * points to #2, but almost all physical-to-virtual translations point to #1.
245 *
246 * This is so that we can have both a directmap of all physical memory *and*
247 * take full advantage of the limited (s32) immediate addressing range (2G)
248 * of x86_64.
249 *
250 * See Documentation/arch/x86/x86_64/mm.rst for more detail.
251 */
252
/* First pfn of the kernel image: physical address of _text. */
static inline unsigned long highmap_start_pfn(void)
{
	return __pa_symbol(_text) >> PAGE_SHIFT;
}
257
/*
 * Last pfn of the kernel image.  _brk_end is rounded up to PMD_SIZE
 * because the highmap is mapped with large pages up to that boundary.
 */
static inline unsigned long highmap_end_pfn(void)
{
	/* Do not reference physical address outside the kernel. */
	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
263
/* Does @pfn have an alias in the high kernel text mapping? */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/*
	 * Kernel text has an alias mapping at a high address, known
	 * here as "highmap".
	 */
	return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
}
272
273 #else
274
/* 32-bit stub: no highmap alias exists, so no pfn is ever in it. */
static bool __cpa_pfn_in_highmap(unsigned long pfn)
{
	/* There is no highmap on 32-bit */
	return false;
}
280
281 #endif
282
283 /*
284 * See set_mce_nospec().
285 *
286 * Machine check recovery code needs to change cache mode of poisoned pages to
287 * UC to avoid speculative access logging another error. But passing the
288 * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
289 * speculative access. So we cheat and flip the top bit of the address. This
290 * works fine for the code that updates the page tables. But at the end of the
291 * process we need to flush the TLB and cache and the non-canonical address
292 * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
293 *
294 * But in the common case we already have a canonical address. This code
295 * will fix the top bit if needed and is a no-op otherwise.
296 */
static inline unsigned long fix_addr(unsigned long addr)
{
#ifdef CONFIG_X86_64
	/*
	 * Shift bit 62 up into bit 63 and arithmetic-shift back down,
	 * sign-extending so the address becomes canonical again.
	 */
	return (long)(addr << 1) >> 1;
#else
	return addr;
#endif
}
305
/*
 * Return the virtual address of the @idx'th page of the CPA operation,
 * honoring the three addressing modes: pages array, address array, or
 * a contiguous range starting at *cpa->vaddr.  Returns 0 for highmem
 * pages, which have no permanent kernel mapping.
 */
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
	if (cpa->flags & CPA_PAGES_ARRAY) {
		struct page *page = cpa->pages[idx];

		if (unlikely(PageHighMem(page)))
			return 0;

		return (unsigned long)page_address(page);
	}

	if (cpa->flags & CPA_ARRAY)
		return cpa->vaddr[idx];

	return *cpa->vaddr + idx * PAGE_SIZE;
}
322
323 /*
324 * Flushing functions
325 */
326
/*
 * CLFLUSHOPT every cache line of [@vaddr, @vaddr + @size).  The start
 * is aligned down to a cache-line boundary first.  No fencing is done
 * here; callers must order the unordered CLFLUSHOPTs themselves (see
 * clflush_cache_range()).
 */
static void clflush_cache_range_opt(void *vaddr, unsigned int size)
{
	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
	void *vend = vaddr + size;

	/* Empty range after alignment: nothing to flush */
	if (p >= vend)
		return;

	for (; p < vend; p += clflush_size)
		clflushopt(p);
}
339
340 /**
341 * clflush_cache_range - flush a cache range with clflush
342 * @vaddr: virtual start address
343 * @size: number of bytes to flush
344 *
345 * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
346 * SFENCE to avoid ordering issues.
347 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
	mb();	/* order prior stores before the unordered CLFLUSHOPTs */
	clflush_cache_range_opt(vaddr, size);
	mb();	/* make the flushes globally visible before returning */
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
355
356 #ifdef CONFIG_ARCH_HAS_PMEM_API
/* Invalidate a persistent-memory range: a fenced cache-line flush. */
void arch_invalidate_pmem(void *addr, size_t size)
{
	clflush_cache_range(addr, size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
362 #endif
363
364 #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
/*
 * WBINVD is functional only on bare metal; under a hypervisor it may
 * be a no-op or not flush what the caller expects.
 */
bool cpu_cache_has_invalidate_memregion(void)
{
	return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM");
370
/*
 * Flush all CPU caches for a memregion.  x86 has no ranged cache
 * invalidation, so @start/@len are ignored and WBINVD runs on every
 * CPU.  Returns -ENXIO when WBINVD cannot be trusted (virtualized).
 */
int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len)
{
	if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
		return -ENXIO;
	wbinvd_on_all_cpus();
	return 0;
}
EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM");
379 #endif
380
/*
 * Per-CPU worker for cpa_flush_all(): full TLB flush and, when @arg is
 * non-zero, a WBINVD cache flush as well.
 */
static void __cpa_flush_all(void *arg)
{
	unsigned long cache = (unsigned long)arg;

	/*
	 * Flush all to work around Errata in early athlons regarding
	 * large page flushing.
	 */
	__flush_tlb_all();

	/* WBINVD only exists on 486 and later */
	if (cache && boot_cpu_data.x86 >= 4)
		wbinvd();
}
394
/* Flush TLB (and caches if @cache) on every CPU, waiting for completion. */
static void cpa_flush_all(unsigned long cache)
{
	/* on_each_cpu() must not be called with IRQs off once SMP is up */
	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
401
402 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
403
/*
 * After a CPA operation flagged CPA_COLLAPSE, try to merge 4K mappings
 * back into large pages: every address of the array modes, or each PMD
 * of the PMD-aligned span in range mode.  If anything was collapsed,
 * flush the TLB before freeing the now-unused page tables.
 */
static void cpa_collapse_large_pages(struct cpa_data *cpa)
{
	unsigned long start, addr, end;
	struct ptdesc *ptdesc, *tmp;
	LIST_HEAD(pgtables);
	int collapsed = 0;
	int i;

	if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
		for (i = 0; i < cpa->numpages; i++)
			collapsed += collapse_large_pages(__cpa_addr(cpa, i),
							  &pgtables);
	} else {
		addr = __cpa_addr(cpa, 0);
		start = addr & PMD_MASK;
		end = addr + PAGE_SIZE * cpa->numpages;

		for (addr = start; within(addr, start, end); addr += PMD_SIZE)
			collapsed += collapse_large_pages(addr, &pgtables);
	}

	if (!collapsed)
		return;

	/* Stale TLB entries must be gone before the tables are freed. */
	flush_tlb_all();

	list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
		list_del(&ptdesc->pt_list);
		pagetable_free(ptdesc);
	}
}
435
/*
 * Flush the TLB (and, when @cache is set, the CPU caches) for the
 * range covered by @cpa.  Without CLFLUSH support a cache flush
 * degrades to a global WBINVD + full TLB flush; otherwise the TLB
 * range is flushed and each present page is clflushed individually.
 */
static void cpa_flush(struct cpa_data *cpa, int cache)
{
	unsigned long start, end;
	unsigned int i;

	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);

	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
		cpa_flush_all(cache);
		goto collapse_large_pages;
	}

	start = fix_addr(__cpa_addr(cpa, 0));
	end = start + cpa->numpages * PAGE_SIZE;
	if (cpa->force_flush_all)
		end = TLB_FLUSH_ALL;

	flush_tlb_kernel_range(start, end);

	if (!cache)
		goto collapse_large_pages;

	mb();	/* fence the unordered CLFLUSHOPTs issued below */
	for (i = 0; i < cpa->numpages; i++) {
		unsigned long addr = __cpa_addr(cpa, i);
		unsigned int level;

		pte_t *pte = lookup_address(addr, &level);

		/*
		 * Only flush present addresses:
		 */
		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
			clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
	}
	mb();

collapse_large_pages:
	if (cpa->flags & CPA_COLLAPSE)
		cpa_collapse_large_pages(cpa);
}
477
/*
 * Return true when the inclusive ranges [@r1_start, @r1_end] and
 * [@r2_start, @r2_end] share at least one point.
 *
 * This is the canonical interval-intersection test: each range must
 * start no later than the other one ends.  The previous second
 * disjunct (r2_start <= r1_end && r2_end >= r1_start) was logically
 * identical to the first and has been dropped.
 */
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
		     unsigned long r2_start, unsigned long r2_end)
{
	return r1_start <= r2_end && r2_start <= r1_end;
}
484
485 #ifdef CONFIG_PCI_BIOS
486 /*
487 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
488 * based config access (CONFIG_PCI_GOBIOS) support.
489 */
490 #define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
491 #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
492
/*
 * Return _PAGE_NX as "forbidden" when [@spfn, @epfn] touches the PCI
 * BIOS region while BIOS-based config access is in use: that region
 * must stay executable.
 */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
		return _PAGE_NX;
	return 0;
}
499 #else
/* !CONFIG_PCI_BIOS: nothing to protect. */
static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
{
	return 0;
}
504 #endif
505
506 /*
507 * The .rodata section needs to be read-only. Using the pfn catches all
508 * aliases. This also includes __ro_after_init, so do not enforce until
509 * kernel_set_to_readonly is true.
510 */
/*
 * Return _PAGE_RW as "forbidden" when [@spfn, @epfn] overlaps .rodata
 * and the kernel has already been marked read-only.
 */
static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
{
	unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));

	/*
	 * Note: __end_rodata is at page aligned and not inclusive, so
	 * subtract 1 to get the last enforced PFN in the rodata area.
	 */
	epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;

	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
		return _PAGE_RW;
	return 0;
}
525
526 /*
527 * Protect kernel text against becoming non executable by forbidding
528 * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
529 * out of which the kernel actually executes. Do not protect the low
530 * mapping.
531 *
532 * This does not cover __inittext since that is gone after boot.
533 */
/*
 * Return _PAGE_NX as "forbidden" when the virtual range [@start, @end]
 * overlaps the kernel text mapping (_text -> _etext).
 */
static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
{
	/* _etext is exclusive, make t_end the last covered byte */
	unsigned long t_end = (unsigned long)_etext - 1;
	unsigned long t_start = (unsigned long)_text;

	if (overlaps(start, end, t_start, t_end))
		return _PAGE_NX;
	return 0;
}
543
544 #if defined(CONFIG_X86_64)
545 /*
546 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
547 * kernel text mappings for the large page aligned text, rodata sections
548 * will be always read-only. For the kernel identity mappings covering the
549 * holes caused by this alignment can be anything that user asks.
550 *
551 * This will preserve the large page mappings for kernel text/data at no
552 * extra cost.
553 */
protect_kernel_text_ro(unsigned long start,unsigned long end)554 static pgprotval_t protect_kernel_text_ro(unsigned long start,
555 unsigned long end)
556 {
557 unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
558 unsigned long t_start = (unsigned long)_text;
559 unsigned int level;
560
561 if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
562 return 0;
563 /*
564 * Don't enforce the !RW mapping for the kernel text mapping, if
565 * the current mapping is already using small page mapping. No
566 * need to work hard to preserve large page mappings in this case.
567 *
568 * This also fixes the Linux Xen paravirt guest boot failure caused
569 * by unexpected read-only mappings for kernel identity
570 * mappings. In this paravirt guest case, the kernel text mapping
571 * and the kernel identity mapping share the same page-table pages,
572 * so the protections for kernel text and identity mappings have to
573 * be the same.
574 */
575 if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
576 return _PAGE_RW;
577 return 0;
578 }
579 #else
protect_kernel_text_ro(unsigned long start,unsigned long end)580 static pgprotval_t protect_kernel_text_ro(unsigned long start,
581 unsigned long end)
582 {
583 return 0;
584 }
585 #endif
586
/* True when @prot requests any of the bits forbidden by @val. */
static inline bool conflicts(pgprot_t prot, pgprotval_t val)
{
	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
}
591
check_conflict(int warnlvl,pgprot_t prot,pgprotval_t val,unsigned long start,unsigned long end,unsigned long pfn,const char * txt)592 static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
593 unsigned long start, unsigned long end,
594 unsigned long pfn, const char *txt)
595 {
596 static const char *lvltxt[] = {
597 [CPA_CONFLICT] = "conflict",
598 [CPA_PROTECT] = "protect",
599 [CPA_DETECT] = "detect",
600 };
601
602 if (warnlvl > cpa_warn_level || !conflicts(prot, val))
603 return;
604
605 pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
606 lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
607 (unsigned long long)val);
608 }
609
610 /*
611 * Certain areas of memory on x86 require very specific protection flags,
612 * for example the BIOS area or kernel text. Callers don't always get this
613 * right (again, ioremap() on BIOS memory is not uncommon) so this function
614 * checks and fixes these known static required protection bits.
615 */
static_protections(pgprot_t prot,unsigned long start,unsigned long pfn,unsigned long npg,unsigned long lpsize,int warnlvl)616 static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
617 unsigned long pfn, unsigned long npg,
618 unsigned long lpsize, int warnlvl)
619 {
620 pgprotval_t forbidden, res;
621 unsigned long end;
622
623 /*
624 * There is no point in checking RW/NX conflicts when the requested
625 * mapping is setting the page !PRESENT.
626 */
627 if (!(pgprot_val(prot) & _PAGE_PRESENT))
628 return prot;
629
630 /* Operate on the virtual address */
631 end = start + npg * PAGE_SIZE - 1;
632
633 res = protect_kernel_text(start, end);
634 check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
635 forbidden = res;
636
637 /*
638 * Special case to preserve a large page. If the change spawns the
639 * full large page mapping then there is no point to split it
640 * up. Happens with ftrace and is going to be removed once ftrace
641 * switched to text_poke().
642 */
643 if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) {
644 res = protect_kernel_text_ro(start, end);
645 check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
646 forbidden |= res;
647 }
648
649 /* Check the PFN directly */
650 res = protect_pci_bios(pfn, pfn + npg - 1);
651 check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
652 forbidden |= res;
653
654 res = protect_rodata(pfn, pfn + npg - 1);
655 check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
656 forbidden |= res;
657
658 return __pgprot(pgprot_val(prot) & ~forbidden);
659 }
660
661 /*
662 * Validate strict W^X semantics.
663 */
verify_rwx(pgprot_t old,pgprot_t new,unsigned long start,unsigned long pfn,unsigned long npg,bool nx,bool rw)664 static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start,
665 unsigned long pfn, unsigned long npg,
666 bool nx, bool rw)
667 {
668 unsigned long end;
669
670 /*
671 * 32-bit has some unfixable W+X issues, like EFI code
672 * and writeable data being in the same page. Disable
673 * detection and enforcement there.
674 */
675 if (IS_ENABLED(CONFIG_X86_32))
676 return new;
677
678 /* Only verify when NX is supported: */
679 if (!(__supported_pte_mask & _PAGE_NX))
680 return new;
681
682 if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX)))
683 return new;
684
685 if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW)
686 return new;
687
688 /* Non-leaf translation entries can disable writing or execution. */
689 if (!rw || nx)
690 return new;
691
692 end = start + npg * PAGE_SIZE - 1;
693 WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n",
694 (unsigned long long)pgprot_val(old),
695 (unsigned long long)pgprot_val(new),
696 start, end, pfn);
697
698 /*
699 * For now, allow all permission change attempts by returning the
700 * attempted permissions. This can 'return old' to actively
701 * refuse the permission change at a later time.
702 */
703 return new;
704 }
705
706 /*
707 * Lookup the page table entry for a virtual address in a specific pgd.
708 * Return a pointer to the entry (or NULL if the entry does not exist),
709 * the level of the entry, and the effective NX and RW bits of all
710 * page table levels.
711 */
lookup_address_in_pgd_attr(pgd_t * pgd,unsigned long address,unsigned int * level,bool * nx,bool * rw)712 pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address,
713 unsigned int *level, bool *nx, bool *rw)
714 {
715 p4d_t *p4d;
716 pud_t *pud;
717 pmd_t *pmd;
718
719 *level = PG_LEVEL_256T;
720 *nx = false;
721 *rw = true;
722
723 if (pgd_none(*pgd))
724 return NULL;
725
726 *level = PG_LEVEL_512G;
727 *nx |= pgd_flags(*pgd) & _PAGE_NX;
728 *rw &= pgd_flags(*pgd) & _PAGE_RW;
729
730 p4d = p4d_offset(pgd, address);
731 if (p4d_none(*p4d))
732 return NULL;
733
734 if (p4d_leaf(*p4d) || !p4d_present(*p4d))
735 return (pte_t *)p4d;
736
737 *level = PG_LEVEL_1G;
738 *nx |= p4d_flags(*p4d) & _PAGE_NX;
739 *rw &= p4d_flags(*p4d) & _PAGE_RW;
740
741 pud = pud_offset(p4d, address);
742 if (pud_none(*pud))
743 return NULL;
744
745 if (pud_leaf(*pud) || !pud_present(*pud))
746 return (pte_t *)pud;
747
748 *level = PG_LEVEL_2M;
749 *nx |= pud_flags(*pud) & _PAGE_NX;
750 *rw &= pud_flags(*pud) & _PAGE_RW;
751
752 pmd = pmd_offset(pud, address);
753 if (pmd_none(*pmd))
754 return NULL;
755
756 if (pmd_leaf(*pmd) || !pmd_present(*pmd))
757 return (pte_t *)pmd;
758
759 *level = PG_LEVEL_4K;
760 *nx |= pmd_flags(*pmd) & _PAGE_NX;
761 *rw &= pmd_flags(*pmd) & _PAGE_RW;
762
763 return pte_offset_kernel(pmd, address);
764 }
765
766 /*
767 * Lookup the page table entry for a virtual address in a specific pgd.
768 * Return a pointer to the entry and the level of the mapping.
769 */
lookup_address_in_pgd(pgd_t * pgd,unsigned long address,unsigned int * level)770 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
771 unsigned int *level)
772 {
773 bool nx, rw;
774
775 return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw);
776 }
777
778 /*
779 * Lookup the page table entry for a virtual address. Return a pointer
780 * to the entry and the level of the mapping.
781 *
782 * Note: the function returns p4d, pud or pmd either when the entry is marked
783 * large or when the present bit is not set. Otherwise it returns NULL.
784 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
	/* Walk the kernel (init_mm) page tables. */
	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
}
EXPORT_SYMBOL_GPL(lookup_address);
790
_lookup_address_cpa(struct cpa_data * cpa,unsigned long address,unsigned int * level,bool * nx,bool * rw)791 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
792 unsigned int *level, bool *nx, bool *rw)
793 {
794 pgd_t *pgd;
795
796 if (!cpa->pgd)
797 pgd = pgd_offset_k(address);
798 else
799 pgd = cpa->pgd + pgd_index(address);
800
801 return lookup_address_in_pgd_attr(pgd, address, level, nx, rw);
802 }
803
804 /*
805 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
806 * or NULL if not present.
807 */
pmd_t *lookup_pmd_address(unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset_k(address);
	if (pgd_none(*pgd))
		return NULL;

	/* Bail on missing, huge, or non-present upper-level entries. */
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d))
		return NULL;

	pud = pud_offset(p4d, address);
	if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud))
		return NULL;

	return pmd_offset(pud, address);
}
828
829 /*
830 * This is necessary because __pa() does not work on some
831 * kinds of memory, like vmalloc() or the alloc_remap()
832 * areas on 32-bit NUMA systems. The percpu areas can
833 * end up in this kind of memory, for instance.
834 *
835 * Note that as long as the PTEs are well-formed with correct PFNs, this
836 * works without checking the PRESENT bit in the leaf PTE. This is unlike
837 * the similar vmalloc_to_page() and derivatives. Callers may depend on
838 * this behavior.
839 *
840 * This could be optimized, but it is only used in paths that are not perf
841 * sensitive, and keeping it unoptimized should increase the testing coverage
842 * for the more obscure platforms.
843 */
phys_addr_t slow_virt_to_phys(void *__virt_addr)
{
	unsigned long virt_addr = (unsigned long)__virt_addr;
	phys_addr_t phys_addr;
	unsigned long offset;
	enum pg_level level;
	pte_t *pte;

	pte = lookup_address(virt_addr, &level);
	BUG_ON(!pte);

	/*
	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
	 * make 32-PAE kernel work correctly.
	 */
	switch (level) {
	case PG_LEVEL_1G:
		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PUD_MASK;
		break;
	case PG_LEVEL_2M:
		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
		offset = virt_addr & ~PMD_MASK;
		break;
	default:
		/* PG_LEVEL_4K; last case, so no break needed */
		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
		offset = virt_addr & ~PAGE_MASK;
	}

	return (phys_addr_t)(phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
877
878 /*
879 * Set the new pmd in all the pgds we know about:
880 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
	/* change init_mm */
	set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
	/*
	 * On 32-bit, every process has its own copy of the kernel PMDs,
	 * so propagate the update to every pgd on the pgd_list as well.
	 */
	{
		struct page *page;

		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd;
			p4d_t *p4d;
			pud_t *pud;
			pmd_t *pmd;

			pgd = (pgd_t *)page_address(page) + pgd_index(address);
			p4d = p4d_offset(pgd, address);
			pud = pud_offset(p4d, address);
			pmd = pmd_offset(pud, address);
			set_pte_atomic((pte_t *)pmd, pte);
		}
	}
#endif
}
904
/* Strip _PAGE_GLOBAL from non-present protections (see comment below). */
static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
{
	/*
	 * _PAGE_GLOBAL means "global page" for present PTEs.
	 * But, it is also used to indicate _PAGE_PROTNONE
	 * for non-present PTEs.
	 *
	 * This ensures that a _PAGE_GLOBAL PTE going from
	 * present to non-present is not confused as
	 * _PAGE_PROTNONE.
	 */
	if (!(pgprot_val(prot) & _PAGE_PRESENT))
		pgprot_val(prot) &= ~_PAGE_GLOBAL;

	return prot;
}
921
__should_split_large_page(pte_t * kpte,unsigned long address,struct cpa_data * cpa)922 static int __should_split_large_page(pte_t *kpte, unsigned long address,
923 struct cpa_data *cpa)
924 {
925 unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
926 pgprot_t old_prot, new_prot, req_prot, chk_prot;
927 pte_t new_pte, *tmp;
928 enum pg_level level;
929 bool nx, rw;
930
931 /*
932 * Check for races, another CPU might have split this page
933 * up already:
934 */
935 tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
936 if (tmp != kpte)
937 return 1;
938
939 switch (level) {
940 case PG_LEVEL_2M:
941 old_prot = pmd_pgprot(*(pmd_t *)kpte);
942 old_pfn = pmd_pfn(*(pmd_t *)kpte);
943 cpa_inc_2m_checked();
944 break;
945 case PG_LEVEL_1G:
946 old_prot = pud_pgprot(*(pud_t *)kpte);
947 old_pfn = pud_pfn(*(pud_t *)kpte);
948 cpa_inc_1g_checked();
949 break;
950 default:
951 return -EINVAL;
952 }
953
954 psize = page_level_size(level);
955 pmask = page_level_mask(level);
956
957 /*
958 * Calculate the number of pages, which fit into this large
959 * page starting at address:
960 */
961 lpaddr = (address + psize) & pmask;
962 numpages = (lpaddr - address) >> PAGE_SHIFT;
963 if (numpages < cpa->numpages)
964 cpa->numpages = numpages;
965
966 /*
967 * We are safe now. Check whether the new pgprot is the same:
968 * Convert protection attributes to 4k-format, as cpa->mask* are set
969 * up accordingly.
970 */
971
972 /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
973 req_prot = pgprot_large_2_4k(old_prot);
974
975 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
976 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
977
978 /*
979 * req_prot is in format of 4k pages. It must be converted to large
980 * page format: the caching mode includes the PAT bit located at
981 * different bit positions in the two formats.
982 */
983 req_prot = pgprot_4k_2_large(req_prot);
984 req_prot = pgprot_clear_protnone_bits(req_prot);
985 if (pgprot_val(req_prot) & _PAGE_PRESENT)
986 pgprot_val(req_prot) |= _PAGE_PSE;
987
988 /*
989 * old_pfn points to the large page base pfn. So we need to add the
990 * offset of the virtual address:
991 */
992 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
993 cpa->pfn = pfn;
994
995 /*
996 * Calculate the large page base address and the number of 4K pages
997 * in the large page
998 */
999 lpaddr = address & pmask;
1000 numpages = psize >> PAGE_SHIFT;
1001
1002 /*
1003 * Sanity check that the existing mapping is correct versus the static
1004 * protections. static_protections() guards against !PRESENT, so no
1005 * extra conditional required here.
1006 */
1007 chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
1008 psize, CPA_CONFLICT);
1009
1010 if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
1011 /*
1012 * Split the large page and tell the split code to
1013 * enforce static protections.
1014 */
1015 cpa->force_static_prot = 1;
1016 return 1;
1017 }
1018
1019 /*
1020 * Optimization: If the requested pgprot is the same as the current
1021 * pgprot, then the large page can be preserved and no updates are
1022 * required independent of alignment and length of the requested
1023 * range. The above already established that the current pgprot is
1024 * correct, which in consequence makes the requested pgprot correct
1025 * as well if it is the same. The static protection scan below will
1026 * not come to a different conclusion.
1027 */
1028 if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
1029 cpa_inc_lp_sameprot(level);
1030 return 0;
1031 }
1032
1033 /*
1034 * If the requested range does not cover the full page, split it up
1035 */
1036 if (address != lpaddr || cpa->numpages != numpages)
1037 return 1;
1038
1039 /*
1040 * Check whether the requested pgprot is conflicting with a static
1041 * protection requirement in the large page.
1042 */
1043 new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
1044 psize, CPA_DETECT);
1045
1046 new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages,
1047 nx, rw);
1048
1049 /*
1050 * If there is a conflict, split the large page.
1051 *
1052 * There used to be a 4k wise evaluation trying really hard to
1053 * preserve the large pages, but experimentation has shown, that this
1054 * does not help at all. There might be corner cases which would
1055 * preserve one large page occasionally, but it's really not worth the
1056 * extra code and cycles for the common case.
1057 */
1058 if (pgprot_val(req_prot) != pgprot_val(new_prot))
1059 return 1;
1060
1061 /* All checks passed. Update the large page mapping. */
1062 new_pte = pfn_pte(old_pfn, new_prot);
1063 __set_pmd_pte(kpte, address, new_pte);
1064 cpa->flags |= CPA_FLUSHTLB;
1065 cpa_inc_lp_preserved(level);
1066 return 0;
1067 }
1068
/*
 * Decide whether the large page mapping @address at @kpte must be split
 * in order to apply the change described by @cpa.
 *
 * Returns 1 when a split is required, 0 when the large mapping could be
 * preserved (or updated in place), negative on error.
 */
static int should_split_large_page(pte_t *kpte, unsigned long address,
				   struct cpa_data *cpa)
{
	int ret;

	if (cpa->force_split)
		return 1;

	spin_lock(&pgd_lock);
	ret = __should_split_large_page(kpte, address, cpa);
	spin_unlock(&pgd_lock);

	return ret;
}
1083
/*
 * Install one PTE of a split mapping, optionally re-applying the static
 * protection rules when the original large mapping was found to be
 * inconsistent with them.
 */
static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
			  pgprot_t ref_prot, unsigned long address,
			  unsigned long size)
{
	unsigned int npg = PFN_DOWN(size);

	/*
	 * If should_split_large_page() discovered an inconsistent mapping,
	 * remove the invalid protection in the split mapping.
	 */
	if (cpa->force_static_prot) {
		pgprot_t prot;

		/* Hand in lpsize = 0 to enforce the protection mechanism */
		prot = static_protections(ref_prot, address, pfn, npg, 0,
					  CPA_PROTECT);

		if (pgprot_val(prot) != pgprot_val(ref_prot)) {
			/*
			 * If this is splitting a PMD, fix it up. PUD splits
			 * cannot be fixed trivially as that would require to
			 * rescan the newly installed PMD mappings after
			 * returning from split_large_page() so an eventual
			 * further split can allocate the necessary PTE
			 * pages. Warn for now and revisit it in case this
			 * actually happens.
			 */
			if (size == PAGE_SIZE)
				ref_prot = prot;
			else
				pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
		}
	}

	set_pte(pte, pfn_pte(pfn, ref_prot));
}
1119
/*
 * Replace the large mapping at @kpte covering @address with the page
 * table in @ptdesc, filled with smaller entries that preserve the
 * original protections.
 *
 * Returns 0 on success; 1 when another CPU already split the page (or
 * the level is unexpected), in which case the caller frees @ptdesc.
 */
static int
__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
		   struct ptdesc *ptdesc)
{
	unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
	struct page *base = ptdesc_page(ptdesc);
	pte_t *pbase = (pte_t *)page_address(base);
	unsigned int i, level;
	pgprot_t ref_prot;
	bool nx, rw;
	pte_t *tmp;

	spin_lock(&pgd_lock);
	/*
	 * Check for races, another CPU might have split this page
	 * up for us already:
	 */
	tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
	if (tmp != kpte) {
		spin_unlock(&pgd_lock);
		return 1;
	}

	paravirt_alloc_pte(&init_mm, page_to_pfn(base));

	switch (level) {
	case PG_LEVEL_2M:
		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
		/*
		 * Clear PSE (aka _PAGE_PAT) and move
		 * PAT bit to correct position.
		 */
		ref_prot = pgprot_large_2_4k(ref_prot);
		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
		lpaddr = address & PMD_MASK;
		lpinc = PAGE_SIZE;
		break;

	case PG_LEVEL_1G:
		ref_prot = pud_pgprot(*(pud_t *)kpte);
		ref_pfn = pud_pfn(*(pud_t *)kpte);
		pfninc = PMD_SIZE >> PAGE_SHIFT;
		lpaddr = address & PUD_MASK;
		lpinc = PMD_SIZE;
		/*
		 * Clear the PSE flags if the PRESENT flag is not set
		 * otherwise pmd_present() will return true even on a non
		 * present pmd.
		 */
		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
			pgprot_val(ref_prot) &= ~_PAGE_PSE;
		break;

	default:
		spin_unlock(&pgd_lock);
		return 1;
	}

	ref_prot = pgprot_clear_protnone_bits(ref_prot);

	/*
	 * Get the target pfn from the original entry:
	 */
	pfn = ref_pfn;
	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);

	/* Account the split in the direct-map statistics. */
	if (virt_addr_valid(address)) {
		unsigned long pfn = PFN_DOWN(__pa(address));

		if (pfn_range_is_mapped(pfn, pfn + 1))
			split_page_count(level);
	}

	/*
	 * Install the new, split up pagetable.
	 *
	 * We use the standard kernel pagetable protections for the new
	 * pagetable protections, the actual ptes set above control the
	 * primary protection behavior:
	 */
	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

	/*
	 * Do a global flush tlb after splitting the large page
	 * and before we do the actual change page attribute in the PTE.
	 *
	 * Without this, we violate the TLB application note, that says:
	 * "The TLBs may contain both ordinary and large-page
	 *  translations for a 4-KByte range of linear addresses. This
	 *  may occur if software modifies the paging structures so that
	 *  the page size used for the address range changes. If the two
	 *  translations differ with respect to page frame or attributes
	 *  (e.g., permissions), processor behavior is undefined and may
	 *  be implementation-specific."
	 *
	 * We do this global tlb flush inside the cpa_lock, so that we
	 * don't allow any other cpu, with stale tlb entries change the
	 * page attribute in parallel, that also falls into the
	 * just split large page entry.
	 */
	flush_tlb_all();
	spin_unlock(&pgd_lock);

	return 0;
}
1226
/*
 * Allocate a page table and split the large mapping at @kpte into it.
 * Returns 0 on success (or when the split was lost to a race) and
 * -ENOMEM when the page table allocation fails.
 */
static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
			    unsigned long address)
{
	bool locked = !debug_pagealloc_enabled();
	struct ptdesc *ptdesc;

	/* Drop cpa_lock (when held) around the sleeping allocation. */
	if (locked)
		spin_unlock(&cpa_lock);

	ptdesc = pagetable_alloc(GFP_KERNEL, 0);

	if (locked)
		spin_lock(&cpa_lock);

	if (!ptdesc)
		return -ENOMEM;

	/* A non-zero return means the split was not used; free the table. */
	if (__split_large_page(cpa, kpte, address, ptdesc))
		pagetable_free(ptdesc);

	return 0;
}
1245
/*
 * Try to replace the 4k PTEs under @pmd with a single 2M mapping.
 *
 * Succeeds only when every one of the PTRS_PER_PTE entries is present,
 * physically contiguous with the first, identically protected, suitably
 * aligned and not marked _PAGE_KERNEL_4K. The replaced PTE page is
 * queued on @pgtables; the caller must flush the TLB before freeing it.
 *
 * Returns 1 when the PMD was collapsed, 0 otherwise.
 */
static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
			     struct list_head *pgtables)
{
	pmd_t _pmd, old_pmd;
	pte_t *pte, first;
	unsigned long pfn;
	pgprot_t pgprot;
	int i = 0;

	/* No large pages without PSE support. */
	if (!cpu_feature_enabled(X86_FEATURE_PSE))
		return 0;

	addr &= PMD_MASK;
	pte = pte_offset_kernel(pmd, addr);
	first = *pte;
	pfn = pte_pfn(first);

	/* Make sure alignment is suitable */
	if (PFN_PHYS(pfn) & ~PMD_MASK)
		return 0;

	/* The page is 4k intentionally */
	if (pte_flags(first) & _PAGE_KERNEL_4K)
		return 0;

	/* Check that the rest of PTEs are compatible with the first one */
	for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
		pte_t entry = *pte;

		if (!pte_present(entry))
			return 0;
		if (pte_flags(entry) != pte_flags(first))
			return 0;
		if (pte_pfn(entry) != pte_pfn(first) + i)
			return 0;
	}

	old_pmd = *pmd;

	/* Success: set up a large page */
	pgprot = pgprot_4k_2_large(pte_pgprot(first));
	pgprot_val(pgprot) |= _PAGE_PSE;
	_pmd = pfn_pmd(pfn, pgprot);
	set_pmd(pmd, _pmd);

	/* Queue the page table to be freed after TLB flush */
	list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);

	if (IS_ENABLED(CONFIG_X86_32)) {
		struct page *page;

		/* Update all PGD tables to use the same large page */
		list_for_each_entry(page, &pgd_list, lru) {
			pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			p4d_t *p4d = p4d_offset(pgd, addr);
			pud_t *pud = pud_offset(p4d, addr);
			pmd_t *pmd = pmd_offset(pud, addr);
			/* Something is wrong if entries doesn't match */
			if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
				continue;
			set_pmd(pmd, _pmd);
		}
	}

	/* Account the collapse in the direct-map statistics. */
	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
		collapse_page_count(PG_LEVEL_2M);

	return 1;
}
1315
/*
 * Try to replace the 2M PMD entries under @pud with a single 1G mapping.
 *
 * Succeeds only when gbpages are in use and every PMD entry is a leaf,
 * physically contiguous with the first, identically protected and
 * suitably aligned. The replaced PMD page is queued on @pgtables; the
 * caller must flush the TLB before freeing it.
 *
 * Returns 1 when the PUD was restored, 0 otherwise.
 */
static int collapse_pud_page(pud_t *pud, unsigned long addr,
			     struct list_head *pgtables)
{
	unsigned long pfn;
	pmd_t *pmd, first;
	int i;

	if (!direct_gbpages)
		return 0;

	addr &= PUD_MASK;
	pmd = pmd_offset(pud, addr);
	first = *pmd;

	/*
	 * To restore PUD page all PMD entries must be large and
	 * have suitable alignment
	 */
	pfn = pmd_pfn(first);
	if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
		return 0;

	/*
	 * To restore PUD page, all following PMDs must be compatible with the
	 * first one.
	 */
	for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
		pmd_t entry = *pmd;

		if (!pmd_present(entry) || !pmd_leaf(entry))
			return 0;
		if (pmd_flags(entry) != pmd_flags(first))
			return 0;
		/* Each 2M PMD covers PTRS_PER_PTE worth of pfns. */
		if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
			return 0;
	}

	/* Restore PUD page and queue page table to be freed after TLB flush */
	list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
	set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));

	/* Account the collapse in the direct-map statistics. */
	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
		collapse_page_count(PG_LEVEL_1G);

	return 1;
}
1362
1363 /*
1364 * Collapse PMD and PUD pages in the kernel mapping around the address where
1365 * possible.
1366 *
1367 * Caller must flush TLB and free page tables queued on the list before
1368 * touching the new entries. CPU must not see TLB entries of different size
1369 * with different attributes.
1370 */
static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
{
	int collapsed = 0;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	addr &= PMD_MASK;

	spin_lock(&pgd_lock);
	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		goto out;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		goto out;
	pud = pud_offset(p4d, addr);
	/* Nothing to collapse when the PUD is absent or already a leaf. */
	if (!pud_present(*pud) || pud_leaf(*pud))
		goto out;
	pmd = pmd_offset(pud, addr);
	/* Nothing to collapse when the PMD is absent or already a leaf. */
	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
		goto out;

	/* A successful PMD collapse may make a PUD collapse possible too. */
	collapsed = collapse_pmd_page(pmd, addr, pgtables);
	if (collapsed)
		collapsed += collapse_pud_page(pud, addr, pgtables);

out:
	spin_unlock(&pgd_lock);
	return collapsed;
}
1403
try_to_free_pte_page(pte_t * pte)1404 static bool try_to_free_pte_page(pte_t *pte)
1405 {
1406 int i;
1407
1408 for (i = 0; i < PTRS_PER_PTE; i++)
1409 if (!pte_none(pte[i]))
1410 return false;
1411
1412 pte_free_kernel(&init_mm, pte);
1413 return true;
1414 }
1415
try_to_free_pmd_page(pmd_t * pmd)1416 static bool try_to_free_pmd_page(pmd_t *pmd)
1417 {
1418 int i;
1419
1420 for (i = 0; i < PTRS_PER_PMD; i++)
1421 if (!pmd_none(pmd[i]))
1422 return false;
1423
1424 pmd_free(&init_mm, pmd);
1425 return true;
1426 }
1427
unmap_pte_range(pmd_t * pmd,unsigned long start,unsigned long end)1428 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
1429 {
1430 pte_t *pte = pte_offset_kernel(pmd, start);
1431
1432 while (start < end) {
1433 set_pte(pte, __pte(0));
1434
1435 start += PAGE_SIZE;
1436 pte++;
1437 }
1438
1439 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
1440 pmd_clear(pmd);
1441 return true;
1442 }
1443 return false;
1444 }
1445
/*
 * Unmap [start, end) in one PMD entry; when the PTE page emptied, also
 * try to free the containing PMD page and clear the PUD entry.
 */
static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
			      unsigned long start, unsigned long end)
{
	if (!unmap_pte_range(pmd, start, end))
		return;

	if (try_to_free_pmd_page(pud_pgtable(*pud)))
		pud_clear(pud);
}
1453
/*
 * Unmap [start, end) underneath @pud: 4k-unmap the unaligned head,
 * clear whole 2M leaf entries (recursing into non-leaf PMDs), 4k-unmap
 * the tail, and finally try to free the PMD page once it is empty.
 */
static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, start);

	/*
	 * Not on a 2MB page boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		__unmap_pmd_range(pud, pmd, start, pre_end);

		start = pre_end;
		pmd++;
	}

	/*
	 * Try to unmap in 2M chunks.
	 */
	while (end - start >= PMD_SIZE) {
		if (pmd_leaf(*pmd))
			pmd_clear(pmd);
		else
			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);

		start += PMD_SIZE;
		pmd++;
	}

	/*
	 * 4K leftovers?
	 */
	if (start < end)
		return __unmap_pmd_range(pud, pmd, start, end);

	/*
	 * Try again to free the PMD page if haven't succeeded above.
	 */
	if (!pud_none(*pud))
		if (try_to_free_pmd_page(pud_pgtable(*pud)))
			pud_clear(pud);
}
1497
/*
 * Unmap [start, end) underneath @p4d: 2M/4k-unmap the unaligned head,
 * clear whole 1G leaf entries (recursing into non-leaf PUDs), then
 * handle the tail with unmap_pmd_range().
 */
static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
	pud_t *pud = pud_offset(p4d, start);

	/*
	 * Not on a GB page boundary?
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
		unsigned long pre_end = min_t(unsigned long, end, next_page);

		unmap_pmd_range(pud, start, pre_end);

		start = pre_end;
		pud++;
	}

	/*
	 * Try to unmap in 1G chunks?
	 */
	while (end - start >= PUD_SIZE) {

		if (pud_leaf(*pud))
			pud_clear(pud);
		else
			unmap_pmd_range(pud, start, start + PUD_SIZE);

		start += PUD_SIZE;
		pud++;
	}

	/*
	 * 2M leftovers?
	 */
	if (start < end)
		unmap_pmd_range(pud, start, end);

	/*
	 * No need to try to free the PUD page because we'll free it in
	 * populate_pgd's error path
	 */
}
1540
/*
 * Allocate a PTE page and hook it into @pmd.
 * Returns 0 on success, -1 when the allocation fails.
 */
static int alloc_pte_page(pmd_t *pmd)
{
	pte_t *pte;

	pte = pte_alloc_one_kernel(&init_mm);
	if (!pte)
		return -1;

	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	return 0;
}
1550
/*
 * Allocate a PMD page and hook it into @pud.
 * Returns 0 on success, -1 when the allocation fails.
 */
static int alloc_pmd_page(pud_t *pud)
{
	pmd_t *pmd;

	/*
	 * Pass 0 as a placeholder for the second argument, since the
	 * generic implementation of pmd_alloc_one() does not use it.
	 */
	pmd = pmd_alloc_one(&init_mm, 0);
	if (!pmd)
		return -1;

	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	return 0;
}
1564
/*
 * Install up to @num_pages PTEs for [start, end) under @pmd, consuming
 * consecutive pfns from cpa->pfn.
 */
static void populate_pte(struct cpa_data *cpa,
			 unsigned long start, unsigned long end,
			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
{
	pte_t *pte = pte_offset_kernel(pmd, start);
	unsigned long addr;

	pgprot = pgprot_clear_protnone_bits(pgprot);

	for (addr = start; num_pages && addr < end; addr += PAGE_SIZE) {
		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
		cpa->pfn++;
		pte++;
		num_pages--;
	}
}
1583
/*
 * Map @num_pages starting at @start underneath @pud: 4k-map the
 * unaligned head, 2M-map full PMD-sized chunks, then 4k-map the tail.
 * Consumes pfns from cpa->pfn.
 *
 * Returns the number of pages handled (== @num_pages on success) or -1
 * when a page table allocation failed.
 */
static long populate_pmd(struct cpa_data *cpa,
			 unsigned long start, unsigned long end,
			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
{
	long cur_pages = 0;
	pmd_t *pmd;
	pgprot_t pmd_pgprot;

	/*
	 * Not on a 2M boundary?
	 */
	if (start & (PMD_SIZE - 1)) {
		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;

		pre_end = min_t(unsigned long, pre_end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(unsigned int, num_pages, cur_pages);

		/*
		 * Need a PTE page?
		 */
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);

		start = pre_end;
	}

	/*
	 * We mapped them all?
	 */
	if (num_pages == cur_pages)
		return cur_pages;

	/* 2M entries need the protections in large-page format. */
	pmd_pgprot = pgprot_4k_2_large(pgprot);

	while (end - start >= PMD_SIZE) {

		/*
		 * We cannot use a 1G page so allocate a PMD page if needed.
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		pmd = pmd_offset(pud, start);

		set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
					canon_pgprot(pmd_pgprot))));

		start += PMD_SIZE;
		cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
		cur_pages += PMD_SIZE >> PAGE_SHIFT;
	}

	/*
	 * Map trailing 4K pages.
	 */
	if (start < end) {
		pmd = pmd_offset(pud, start);
		if (pmd_none(*pmd))
			if (alloc_pte_page(pmd))
				return -1;

		populate_pte(cpa, start, end, num_pages - cur_pages,
			     pmd, pgprot);
	}
	return num_pages;
}
1657
/*
 * Map cpa->numpages pages starting at @start underneath @p4d, using 1G
 * pages when the CPU supports them and alignment allows, and falling
 * back to populate_pmd() for the head and tail.
 *
 * Returns the number of pages mapped (which is what the caller stores
 * back into cpa->numpages), or -1 when a page table allocation failed.
 * Note: a tail failure returns the count mapped so far, not an error.
 */
static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
			pgprot_t pgprot)
{
	pud_t *pud;
	unsigned long end;
	long cur_pages = 0;
	pgprot_t pud_pgprot;

	end = start + (cpa->numpages << PAGE_SHIFT);

	/*
	 * Not on a Gb page boundary? => map everything up to it with
	 * smaller pages.
	 */
	if (start & (PUD_SIZE - 1)) {
		unsigned long pre_end;
		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;

		pre_end = min_t(unsigned long, end, next_page);
		cur_pages = (pre_end - start) >> PAGE_SHIFT;
		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);

		pud = pud_offset(p4d, start);

		/*
		 * Need a PMD page?
		 */
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
					 pud, pgprot);
		if (cur_pages < 0)
			return cur_pages;

		start = pre_end;
	}

	/* We mapped them all? */
	if (cpa->numpages == cur_pages)
		return cur_pages;

	pud = pud_offset(p4d, start);
	/* 1G entries need the protections in large-page format. */
	pud_pgprot = pgprot_4k_2_large(pgprot);

	/*
	 * Map everything starting from the Gb boundary, possibly with 1G pages
	 */
	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
		set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
				   canon_pgprot(pud_pgprot))));

		start += PUD_SIZE;
		cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
		cur_pages += PUD_SIZE >> PAGE_SHIFT;
		pud++;
	}

	/* Map trailing leftover */
	if (start < end) {
		long tmp;

		pud = pud_offset(p4d, start);
		if (pud_none(*pud))
			if (alloc_pmd_page(pud))
				return -1;

		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
				   pud, pgprot);
		if (tmp < 0)
			return cur_pages;

		cur_pages += tmp;
	}
	return cur_pages;
}
1735
1736 /*
1737 * Restrictions for kernel page table do not necessarily apply when mapping in
1738 * an alternate PGD.
1739 */
/*
 * Populate the mapping for @addr in the alternate page table cpa->pgd,
 * allocating intermediate P4D/PUD pages as needed and handing down to
 * populate_pud(). On success cpa->numpages is set to the number of
 * pages actually mapped; returns 0 on success, negative on failure.
 */
static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
	pud_t *pud = NULL;	/* shut up gcc */
	p4d_t *p4d;
	pgd_t *pgd_entry;
	long ret;

	pgd_entry = cpa->pgd + pgd_index(addr);

	/* Allocate a P4D page when the PGD entry is empty. */
	if (pgd_none(*pgd_entry)) {
		/*
		 * Pass 0 as a placeholder for the second argument, since the
		 * generic implementation of p4d_alloc_one() does not use it.
		 */
		p4d = p4d_alloc_one(&init_mm, 0);
		if (!p4d)
			return -1;

		set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}

	/*
	 * Allocate a PUD page and hand it down for mapping.
	 */
	p4d = p4d_offset(pgd_entry, addr);
	if (p4d_none(*p4d)) {
		/*
		 * Pass 0 as a placeholder for the second argument, since the
		 * generic implementation of pud_alloc_one() does not use it.
		 */
		pud = pud_alloc_one(&init_mm, 0);
		if (!pud)
			return -1;

		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}

	/* Apply the request's set/clear masks to the leaf protections. */
	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
	pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);

	ret = populate_pud(cpa, addr, p4d, pgprot);
	if (ret < 0) {
		/*
		 * Leave the PUD page in place in case some other CPU or thread
		 * already found it, but remove any useless entries we just
		 * added to it.
		 */
		unmap_pud_range(p4d, addr,
				addr + (cpa->numpages << PAGE_SHIFT));
		return ret;
	}

	cpa->numpages = ret;
	return 0;
}
1796
__cpa_process_fault(struct cpa_data * cpa,unsigned long vaddr,int primary)1797 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1798 int primary)
1799 {
1800 if (cpa->pgd) {
1801 /*
1802 * Right now, we only execute this code path when mapping
1803 * the EFI virtual memory map regions, no other users
1804 * provide a ->pgd value. This may change in the future.
1805 */
1806 return populate_pgd(cpa, vaddr);
1807 }
1808
1809 /*
1810 * Ignore all non primary paths.
1811 */
1812 if (!primary) {
1813 cpa->numpages = 1;
1814 return 0;
1815 }
1816
1817 /*
1818 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1819 * to have holes.
1820 * Also set numpages to '1' indicating that we processed cpa req for
1821 * one virtual address page and its pfn. TBD: numpages can be set based
1822 * on the initial value and the level returned by lookup_address().
1823 */
1824 if (within(vaddr, PAGE_OFFSET,
1825 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1826 cpa->numpages = 1;
1827 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1828 return 0;
1829
1830 } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
1831 /* Faults in the highmap are OK, so do not warn: */
1832 return -EFAULT;
1833 } else {
1834 WARN(1, KERN_WARNING "CPA: called for zero pte. "
1835 "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1836 *cpa->vaddr);
1837
1838 return -EFAULT;
1839 }
1840 }
1841
/*
 * Apply cpa's set/clear masks to the mapping of the current address
 * (cpa->curpage). A 4k mapping is updated in place; a large mapping is
 * either preserved/updated as a whole or split and the lookup retried.
 * cpa->numpages is updated to the number of pages actually handled.
 */
static int __change_page_attr(struct cpa_data *cpa, int primary)
{
	unsigned long address;
	int do_split, err;
	unsigned int level;
	pte_t *kpte, old_pte;
	bool nx, rw;

	address = __cpa_addr(cpa, cpa->curpage);
repeat:
	kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw);
	if (!kpte)
		return __cpa_process_fault(cpa, address, primary);

	old_pte = *kpte;
	if (pte_none(old_pte))
		return __cpa_process_fault(cpa, address, primary);

	if (level == PG_LEVEL_4K) {
		pte_t new_pte;
		pgprot_t old_prot = pte_pgprot(old_pte);
		pgprot_t new_prot = pte_pgprot(old_pte);
		unsigned long pfn = pte_pfn(old_pte);

		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

		cpa_inc_4k_install();
		/* Hand in lpsize = 0 to enforce the protection mechanism */
		new_prot = static_protections(new_prot, address, pfn, 1, 0,
					      CPA_PROTECT);

		new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1,
				      nx, rw);

		new_prot = pgprot_clear_protnone_bits(new_prot);

		/*
		 * We need to keep the pfn from the existing PTE,
		 * after all we're only going to change its attributes
		 * not the memory it points to
		 */
		new_pte = pfn_pte(pfn, new_prot);
		cpa->pfn = pfn;
		/*
		 * Do we really change anything ?
		 */
		if (pte_val(old_pte) != pte_val(new_pte)) {
			set_pte_atomic(kpte, new_pte);
			cpa->flags |= CPA_FLUSHTLB;
		}
		cpa->numpages = 1;
		return 0;
	}

	/*
	 * Check, whether we can keep the large page intact
	 * and just change the pte:
	 */
	do_split = should_split_large_page(kpte, address, cpa);
	/*
	 * When the range fits into the existing large page,
	 * return. cp->numpages and cpa->tlbflush have been updated in
	 * try_large_page:
	 */
	if (do_split <= 0)
		return do_split;

	/*
	 * We have to split the large page:
	 */
	err = split_large_page(cpa, kpte, address);
	if (!err)
		goto repeat;

	return err;
}
1919
1920 static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
1921
1922 /*
1923 * Check the directmap and "high kernel map" 'aliases'.
1924 */
static int cpa_process_alias(struct cpa_data *cpa)
{
	struct cpa_data alias_cpa;
	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
	unsigned long vaddr;
	int ret;

	/* A pfn outside the direct map has no aliases to fix up. */
	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
		return 0;

	/*
	 * No need to redo, when the primary call touched the direct
	 * mapping already:
	 */
	vaddr = __cpa_addr(cpa, cpa->curpage);
	if (!(within(vaddr, PAGE_OFFSET,
		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

		/* Re-run the change against the direct-map address. */
		alias_cpa = *cpa;
		alias_cpa.vaddr = &laddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
		alias_cpa.curpage = 0;

		/* Directmap always has NX set, do not modify. */
		if (__supported_pte_mask & _PAGE_NX) {
			alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
			alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
		}

		cpa->force_flush_all = 1;

		ret = __change_page_attr_set_clr(&alias_cpa, 0);
		if (ret)
			return ret;
	}

#ifdef CONFIG_X86_64
	/*
	 * If the primary call didn't touch the high mapping already
	 * and the physical address is inside the kernel map, we need
	 * to touch the high mapped kernel as well:
	 */
	if (!within(vaddr, (unsigned long) _text, _brk_end) &&
	    __cpa_pfn_in_highmap(cpa->pfn)) {
		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
					       __START_KERNEL_map - phys_base;
		alias_cpa = *cpa;
		alias_cpa.vaddr = &temp_cpa_vaddr;
		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
		alias_cpa.curpage = 0;

		/*
		 * [_text, _brk_end) also covers data, do not modify NX except
		 * in cases where the highmap is the primary target.
		 */
		if (__supported_pte_mask & _PAGE_NX) {
			alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
			alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
		}

		cpa->force_flush_all = 1;
		/*
		 * The high mapping range is imprecise, so ignore the
		 * return value.
		 */
		__change_page_attr_set_clr(&alias_cpa, 0);
	}
#endif

	return 0;
}
1996
/*
 * Walk the whole request in chunks: each __change_page_attr() call
 * reports back via cpa->numpages how many pages it handled (a preserved
 * large page or a single 4k page), and the alias mappings are processed
 * after each primary chunk. cpa->numpages is restored before returning.
 */
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
	unsigned long numpages = cpa->numpages;
	unsigned long rempages = numpages;
	int ret = 0;

	/*
	 * No changes, easy!
	 */
	if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
	    !cpa->force_split)
		return ret;

	while (rempages) {
		/*
		 * Store the remaining nr of pages for the large page
		 * preservation check.
		 */
		cpa->numpages = rempages;
		/* for array changes, we can't use large page */
		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
			cpa->numpages = 1;

		if (!debug_pagealloc_enabled())
			spin_lock(&cpa_lock);
		ret = __change_page_attr(cpa, primary);
		if (!debug_pagealloc_enabled())
			spin_unlock(&cpa_lock);
		if (ret)
			goto out;

		if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
			ret = cpa_process_alias(cpa);
			if (ret)
				goto out;
		}

		/*
		 * Adjust the number of pages with the result of the
		 * CPA operation. Either a large page has been
		 * preserved or a single page update happened.
		 */
		BUG_ON(cpa->numpages > rempages || !cpa->numpages);
		rempages -= cpa->numpages;
		cpa->curpage += cpa->numpages;
	}

out:
	/* Restore the original numpages */
	cpa->numpages = numpages;
	return ret;
}
2049
/*
 * Common entry point for the attribute-change helpers below.
 *
 * @addr:	single address, or array of addresses when CPA_ARRAY
 * @numpages:	number of pages to change
 * @mask_set:	protection bits to set
 * @mask_clr:	protection bits to clear
 * @force_split: force splitting of large pages
 * @in_flag:	CPA_ARRAY / CPA_PAGES_ARRAY / CPA_NO_CHECK_ALIAS
 * @pages:	page array when CPA_PAGES_ARRAY is used
 *
 * Runs the change and then flushes as required (everything on error,
 * targeted otherwise, and only with cache flush when a caching
 * attribute was set).
 */
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
				    pgprot_t mask_set, pgprot_t mask_clr,
				    int force_split, int in_flag,
				    struct page **pages)
{
	struct cpa_data cpa;
	int ret, cache;

	memset(&cpa, 0, sizeof(cpa));

	/*
	 * Check, if we are requested to set a not supported
	 * feature.  Clearing non-supported features is OK.
	 */
	mask_set = canon_pgprot(mask_set);

	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
		return 0;

	/* Ensure we are PAGE_SIZE aligned */
	if (in_flag & CPA_ARRAY) {
		int i;
		for (i = 0; i < numpages; i++) {
			if (addr[i] & ~PAGE_MASK) {
				addr[i] &= PAGE_MASK;
				WARN_ON_ONCE(1);
			}
		}
	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
		/*
		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
		 * No need to check in that case
		 */
		if (*addr & ~PAGE_MASK) {
			*addr &= PAGE_MASK;
			/*
			 * People should not be passing in unaligned addresses:
			 */
			WARN_ON_ONCE(1);
		}
	}

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();

	vm_unmap_aliases();

	cpa.vaddr = addr;
	cpa.pages = pages;
	cpa.numpages = numpages;
	cpa.mask_set = mask_set;
	cpa.mask_clr = mask_clr;
	cpa.flags = in_flag;
	cpa.curpage = 0;
	cpa.force_split = force_split;

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * Check whether we really changed something:
	 */
	if (!(cpa.flags & CPA_FLUSHTLB))
		goto out;

	/*
	 * No need to flush, when we did not set any of the caching
	 * attributes:
	 */
	cache = !!pgprot2cachemode(mask_set);

	/*
	 * On error; flush everything to be sure.
	 */
	if (ret) {
		cpa_flush_all(cache);
		goto out;
	}

	cpa_flush(&cpa, cache);
out:
	return ret;
}
2132
/*
 * Set the given protection bits on @numpages pages starting at *@addr,
 * or on each entry of the @addr array when @array is non-zero.
 */
static inline int change_page_attr_set(unsigned long *addr, int numpages,
				       pgprot_t mask, int array)
{
	int cpa_flags = array ? CPA_ARRAY : 0;

	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
					cpa_flags, NULL);
}
2139
/*
 * Clear the given protection bits on @numpages pages starting at *@addr,
 * or on each entry of the @addr array when @array is non-zero.
 */
static inline int change_page_attr_clear(unsigned long *addr, int numpages,
					 pgprot_t mask, int array)
{
	int cpa_flags = array ? CPA_ARRAY : 0;

	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
					cpa_flags, NULL);
}
2146
/* Set protection bits on a page array (addresses taken from @pages). */
static inline int cpa_set_pages_array(struct page **pages, int numpages,
				      pgprot_t mask)
{
	pgprot_t no_clr = __pgprot(0);

	return change_page_attr_set_clr(NULL, numpages, mask, no_clr, 0,
					CPA_PAGES_ARRAY, pages);
}
2153
/* Clear protection bits on a page array (addresses taken from @pages). */
static inline int cpa_clear_pages_array(struct page **pages, int numpages,
					pgprot_t mask)
{
	pgprot_t no_set = __pgprot(0);

	return change_page_attr_set_clr(NULL, numpages, no_set, mask, 0,
					CPA_PAGES_ARRAY, pages);
}
2160
_set_memory_uc(unsigned long addr,int numpages)2161 int _set_memory_uc(unsigned long addr, int numpages)
2162 {
2163 /*
2164 * for now UC MINUS. see comments in ioremap()
2165 * If you really need strong UC use ioremap_uc(), but note
2166 * that you cannot override IO areas with set_memory_*() as
2167 * these helpers cannot work with IO memory.
2168 */
2169 return change_page_attr_set(&addr, numpages,
2170 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
2171 0);
2172 }
2173
/*
 * Switch @numpages pages at @addr to uncached-minus, registering the
 * physical range in the PAT memtype tree first so conflicting cache
 * attribute requests for the same range are rejected.
 *
 * On failure of the attribute change the reservation is rolled back.
 * Returns 0 on success or a negative error code.
 */
int set_memory_uc(unsigned long addr, int numpages)
{
	int ret;

	/*
	 * for now UC MINUS. see comments in ioremap()
	 */
	ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
	if (ret)
		goto out_err;

	ret = _set_memory_uc(addr, numpages);
	if (ret)
		goto out_free;

	return 0;

out_free:
	/* Undo the memtype reservation taken above. */
	memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
	return ret;
}
EXPORT_SYMBOL(set_memory_uc);
2198
/*
 * Switch @numpages pages at @addr to write-combining.
 *
 * Done in two steps: first set the pages to UC-, then replace the cache
 * attribute bits with WC in a second pass (clearing _PAGE_CACHE_MASK so
 * the UC- encoding is fully replaced).
 */
int _set_memory_wc(unsigned long addr, int numpages)
{
	int ret;

	/* Step 1: move the range to UC- ... */
	ret = change_page_attr_set(&addr, numpages,
				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
				   0);
	if (!ret) {
		/* ... step 2: clear the cache bits and set the WC encoding. */
		ret = change_page_attr_set_clr(&addr, numpages,
					       cachemode2pgprot(_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, 0, NULL);
	}
	return ret;
}
2214
set_memory_wc(unsigned long addr,int numpages)2215 int set_memory_wc(unsigned long addr, int numpages)
2216 {
2217 int ret;
2218
2219 ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
2220 _PAGE_CACHE_MODE_WC, NULL);
2221 if (ret)
2222 return ret;
2223
2224 ret = _set_memory_wc(addr, numpages);
2225 if (ret)
2226 memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2227
2228 return ret;
2229 }
2230 EXPORT_SYMBOL(set_memory_wc);
2231
_set_memory_wt(unsigned long addr,int numpages)2232 int _set_memory_wt(unsigned long addr, int numpages)
2233 {
2234 return change_page_attr_set(&addr, numpages,
2235 cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
2236 }
2237
_set_memory_wb(unsigned long addr,int numpages)2238 int _set_memory_wb(unsigned long addr, int numpages)
2239 {
2240 /* WB cache mode is hard wired to all cache attribute bits being 0 */
2241 return change_page_attr_clear(&addr, numpages,
2242 __pgprot(_PAGE_CACHE_MASK), 0);
2243 }
2244
set_memory_wb(unsigned long addr,int numpages)2245 int set_memory_wb(unsigned long addr, int numpages)
2246 {
2247 int ret;
2248
2249 ret = _set_memory_wb(addr, numpages);
2250 if (ret)
2251 return ret;
2252
2253 memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
2254 return 0;
2255 }
2256 EXPORT_SYMBOL(set_memory_wb);
2257
2258 /* Prevent speculative access to a page by marking it not-present */
2259 #ifdef CONFIG_X86_64
set_mce_nospec(unsigned long pfn)2260 int set_mce_nospec(unsigned long pfn)
2261 {
2262 unsigned long decoy_addr;
2263 int rc;
2264
2265 /* SGX pages are not in the 1:1 map */
2266 if (arch_is_platform_page(pfn << PAGE_SHIFT))
2267 return 0;
2268 /*
2269 * We would like to just call:
2270 * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
2271 * but doing that would radically increase the odds of a
2272 * speculative access to the poison page because we'd have
2273 * the virtual address of the kernel 1:1 mapping sitting
2274 * around in registers.
2275 * Instead we get tricky. We create a non-canonical address
2276 * that looks just like the one we want, but has bit 63 flipped.
2277 * This relies on set_memory_XX() properly sanitizing any __pa()
2278 * results with __PHYSICAL_MASK or PTE_PFN_MASK.
2279 */
2280 decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
2281
2282 rc = set_memory_np(decoy_addr, 1);
2283 if (rc)
2284 pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
2285 return rc;
2286 }
2287 EXPORT_SYMBOL_GPL(set_mce_nospec);
2288
2289 /* Restore full speculative operation to the pfn. */
int clear_mce_nospec(unsigned long pfn)
{
	/* Re-establish the present bit on the 1:1 mapping of @pfn. */
	return set_memory_p((unsigned long)pfn_to_kaddr(pfn), 1);
}
EXPORT_SYMBOL_GPL(clear_mce_nospec);
2297 #endif /* CONFIG_X86_64 */
2298
set_memory_x(unsigned long addr,int numpages)2299 int set_memory_x(unsigned long addr, int numpages)
2300 {
2301 if (!(__supported_pte_mask & _PAGE_NX))
2302 return 0;
2303
2304 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
2305 }
2306
set_memory_nx(unsigned long addr,int numpages)2307 int set_memory_nx(unsigned long addr, int numpages)
2308 {
2309 if (!(__supported_pte_mask & _PAGE_NX))
2310 return 0;
2311
2312 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
2313 }
2314
set_memory_ro(unsigned long addr,int numpages)2315 int set_memory_ro(unsigned long addr, int numpages)
2316 {
2317 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
2318 }
2319
set_memory_rox(unsigned long addr,int numpages)2320 int set_memory_rox(unsigned long addr, int numpages)
2321 {
2322 pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
2323
2324 if (__supported_pte_mask & _PAGE_NX)
2325 clr.pgprot |= _PAGE_NX;
2326
2327 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
2328 CPA_COLLAPSE, NULL);
2329 }
2330
set_memory_rw(unsigned long addr,int numpages)2331 int set_memory_rw(unsigned long addr, int numpages)
2332 {
2333 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
2334 }
2335
set_memory_np(unsigned long addr,int numpages)2336 int set_memory_np(unsigned long addr, int numpages)
2337 {
2338 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2339 }
2340
set_memory_np_noalias(unsigned long addr,int numpages)2341 int set_memory_np_noalias(unsigned long addr, int numpages)
2342 {
2343 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
2344 __pgprot(_PAGE_PRESENT), 0,
2345 CPA_NO_CHECK_ALIAS, NULL);
2346 }
2347
set_memory_p(unsigned long addr,int numpages)2348 int set_memory_p(unsigned long addr, int numpages)
2349 {
2350 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
2351 }
2352
set_memory_4k(unsigned long addr,int numpages)2353 int set_memory_4k(unsigned long addr, int numpages)
2354 {
2355 return change_page_attr_set_clr(&addr, numpages,
2356 __pgprot(_PAGE_KERNEL_4K),
2357 __pgprot(0), 1, 0, NULL);
2358 }
2359
set_memory_nonglobal(unsigned long addr,int numpages)2360 int set_memory_nonglobal(unsigned long addr, int numpages)
2361 {
2362 return change_page_attr_clear(&addr, numpages,
2363 __pgprot(_PAGE_GLOBAL), 0);
2364 }
2365
set_memory_global(unsigned long addr,int numpages)2366 int set_memory_global(unsigned long addr, int numpages)
2367 {
2368 return change_page_attr_set(&addr, numpages,
2369 __pgprot(_PAGE_GLOBAL), 0);
2370 }
2371
2372 /*
2373 * __set_memory_enc_pgtable() is used for the hypervisors that get
2374 * informed about "encryption" status via page tables.
2375 */
/*
 * Convert @numpages pages at @addr between encrypted (private) and
 * decrypted (shared), for hypervisors that learn about the encryption
 * status via page table bits.
 *
 * The ordering here is deliberate: flush caches/TLB as required before
 * the change, notify the hypervisor, change the PTEs, flush the TLB
 * again, then notify the hypervisor that the change is complete.
 */
static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
{
	pgprot_t empty = __pgprot(0);
	struct cpa_data cpa;
	int ret;

	/* Should not be working on unaligned addresses */
	if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
		addr &= PAGE_MASK;

	memset(&cpa, 0, sizeof(cpa));
	cpa.vaddr = &addr;
	cpa.numpages = numpages;
	/* Encrypting sets the enc bits and clears the dec bits; vice versa. */
	cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
	cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
	cpa.pgd = init_mm.pgd;

	/* Must avoid aliasing mappings in the highmem code */
	kmap_flush_unused();
	vm_unmap_aliases();

	/* Flush the caches as needed before changing the encryption attribute. */
	if (x86_platform.guest.enc_tlb_flush_required(enc))
		cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());

	/* Notify hypervisor that we are about to set/clr encryption attribute. */
	ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc);
	if (ret)
		goto vmm_fail;

	ret = __change_page_attr_set_clr(&cpa, 1);

	/*
	 * After changing the encryption attribute, we need to flush TLBs again
	 * in case any speculative TLB caching occurred (but no need to flush
	 * caches again). We could just use cpa_flush_all(), but in case TLB
	 * flushing gets optimized in the cpa_flush() path use the same logic
	 * as above.
	 */
	cpa_flush(&cpa, 0);

	if (ret)
		return ret;

	/* Notify hypervisor that we have successfully set/clr encryption attribute. */
	ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc);
	if (ret)
		goto vmm_fail;

	return 0;

vmm_fail:
	WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n",
		  (void *)addr, numpages, enc ? "private" : "shared", ret);

	return ret;
}
2433
2434 /*
2435 * The lock serializes conversions between private and shared memory.
2436 *
2437 * It is taken for read on conversion. A write lock guarantees that no
2438 * concurrent conversions are in progress.
2439 */
2440 static DECLARE_RWSEM(mem_enc_lock);
2441
2442 /*
2443 * Stop new private<->shared conversions.
2444 *
2445 * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
2446 * The lock is not released to prevent new conversions from being started.
2447 */
set_memory_enc_stop_conversion(void)2448 bool set_memory_enc_stop_conversion(void)
2449 {
2450 /*
2451 * In a crash scenario, sleep is not allowed. Try to take the lock.
2452 * Failure indicates that there is a race with the conversion.
2453 */
2454 if (oops_in_progress)
2455 return down_write_trylock(&mem_enc_lock);
2456
2457 down_write(&mem_enc_lock);
2458
2459 return true;
2460 }
2461
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
	int ret;

	/* Nothing to convert unless memory encryption is active. */
	if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		return 0;

	/* A held write lock means conversions have been stopped. */
	if (!down_read_trylock(&mem_enc_lock))
		return -EBUSY;

	ret = __set_memory_enc_pgtable(addr, numpages, enc);
	up_read(&mem_enc_lock);

	return ret;
}
2477
/* Convert @numpages pages at @addr to encrypted (private) memory. */
int set_memory_encrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, true);
}
EXPORT_SYMBOL_GPL(set_memory_encrypted);
2483
/* Convert @numpages pages at @addr to decrypted (shared) memory. */
int set_memory_decrypted(unsigned long addr, int numpages)
{
	return __set_memory_enc_dec(addr, numpages, false);
}
EXPORT_SYMBOL_GPL(set_memory_decrypted);
2489
/* Page-based wrapper around set_memory_uc(). */
int set_pages_uc(struct page *page, int numpages)
{
	return set_memory_uc((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_uc);
2497
/*
 * Change the cache mode of an array of pages to @new_type.
 *
 * A memtype reservation is taken for every lowmem page first; highmem
 * pages are skipped (they are not in the 1:1 map). On any failure, all
 * reservations taken so far are rolled back and -EINVAL is returned
 * (the original error code is intentionally not propagated).
 */
static int _set_pages_array(struct page **pages, int numpages,
			    enum page_cache_mode new_type)
{
	unsigned long start;
	unsigned long end;
	enum page_cache_mode set_type;
	int i;
	int free_idx;
	int ret;

	for (i = 0; i < numpages; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		if (memtype_reserve(start, end, new_type, NULL))
			goto err_out;
	}

	/* If WC, set to UC- first and then WC */
	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
				_PAGE_CACHE_MODE_UC_MINUS : new_type;

	ret = cpa_set_pages_array(pages, numpages,
				  cachemode2pgprot(set_type));
	/* Second pass for WC: replace the UC- encoding with WC. */
	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
		ret = change_page_attr_set_clr(NULL, numpages,
					       cachemode2pgprot(
						_PAGE_CACHE_MODE_WC),
					       __pgprot(_PAGE_CACHE_MASK),
					       0, CPA_PAGES_ARRAY, pages);
	if (ret)
		goto err_out;
	return 0; /* Success */
err_out:
	/* Roll back the reservations taken before the failure point. */
	free_idx = i;
	for (i = 0; i < free_idx; i++) {
		if (PageHighMem(pages[i]))
			continue;
		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
		end = start + PAGE_SIZE;
		memtype_free(start, end);
	}
	return -EINVAL;
}
2543
/* Set an array of pages to uncached-minus. */
int set_pages_array_uc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);
2549
/* Set an array of pages to write-combining. */
int set_pages_array_wc(struct page **pages, int numpages)
{
	return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);
2555
/* Page-based wrapper around set_memory_wb(). */
int set_pages_wb(struct page *page, int numpages)
{
	return set_memory_wb((unsigned long)page_address(page), numpages);
}
EXPORT_SYMBOL(set_pages_wb);
2563
set_pages_array_wb(struct page ** pages,int numpages)2564 int set_pages_array_wb(struct page **pages, int numpages)
2565 {
2566 int retval;
2567 unsigned long start;
2568 unsigned long end;
2569 int i;
2570
2571 /* WB cache mode is hard wired to all cache attribute bits being 0 */
2572 retval = cpa_clear_pages_array(pages, numpages,
2573 __pgprot(_PAGE_CACHE_MASK));
2574 if (retval)
2575 return retval;
2576
2577 for (i = 0; i < numpages; i++) {
2578 if (PageHighMem(pages[i]))
2579 continue;
2580 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
2581 end = start + PAGE_SIZE;
2582 memtype_free(start, end);
2583 }
2584
2585 return 0;
2586 }
2587 EXPORT_SYMBOL(set_pages_array_wb);
2588
/* Page-based wrapper around set_memory_ro(). */
int set_pages_ro(struct page *page, int numpages)
{
	return set_memory_ro((unsigned long)page_address(page), numpages);
}
2595
/* Page-based wrapper around set_memory_rw(). */
int set_pages_rw(struct page *page, int numpages)
{
	return set_memory_rw((unsigned long)page_address(page), numpages);
}
2602
/*
 * Map @numpages pages starting at @page present + writable in the
 * kernel mapping, without alias checking (see comment below).
 */
static int __set_pages_p(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
				.mask_clr = __pgprot(0),
				.flags = CPA_NO_CHECK_ALIAS };

	/*
	 * No alias checking needed for setting present flag. otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 1);
}
2621
/*
 * Unmap @numpages pages starting at @page from the kernel mapping:
 * clear the present, RW and DIRTY bits, without alias checking.
 */
static int __set_pages_np(struct page *page, int numpages)
{
	unsigned long tempaddr = (unsigned long) page_address(page);
	struct cpa_data cpa = { .vaddr = &tempaddr,
				.pgd = NULL,
				.numpages = numpages,
				.mask_set = __pgprot(0),
				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
				.flags = CPA_NO_CHECK_ALIAS };

	/*
	 * No alias checking needed for setting not present flag. otherwise,
	 * we may need to break large pages for 64-bit kernel text
	 * mappings (this adds to complexity if we want to do this from
	 * atomic context especially). Let's keep it simple!
	 */
	return __change_page_attr_set_clr(&cpa, 1);
}
2640
/* Unmap @page from the direct map; the caller is responsible for TLB flushing. */
int set_direct_map_invalid_noflush(struct page *page)
{
	return __set_pages_np(page, 1);
}
2645
/* Restore @page in the direct map; the caller is responsible for TLB flushing. */
int set_direct_map_default_noflush(struct page *page)
{
	return __set_pages_p(page, 1);
}
2650
/*
 * Map or unmap @nr pages starting at @page in the direct map depending
 * on @valid; the caller is responsible for TLB flushing.
 */
int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
{
	return valid ? __set_pages_p(page, nr) : __set_pages_np(page, nr);
}
2658
2659 #ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * DEBUG_PAGEALLOC hook: map (@enable != 0) or unmap @numpages pages
 * starting at @page in the kernel mapping, then flush the local TLB.
 */
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
	/* Highmem pages have no permanent kernel mapping to change. */
	if (PageHighMem(page))
		return;
	if (!enable) {
		debug_check_no_locks_freed(page_address(page),
					   numpages * PAGE_SIZE);
	}

	/*
	 * The return value is ignored as the calls cannot fail.
	 * Large pages for identity mappings are not used at boot time
	 * and hence no memory allocations during large page split.
	 */
	if (enable)
		__set_pages_p(page, numpages);
	else
		__set_pages_np(page, numpages);

	/*
	 * We should perform an IPI and flush all tlbs,
	 * but that can deadlock->flush only current cpu.
	 * Preemption needs to be disabled around __flush_tlb_all() due to
	 * CR3 reload in __native_flush_tlb().
	 */
	preempt_disable();
	__flush_tlb_all();
	preempt_enable();

	arch_flush_lazy_mmu_mode();
}
2691 #endif /* CONFIG_DEBUG_PAGEALLOC */
2692
kernel_page_present(struct page * page)2693 bool kernel_page_present(struct page *page)
2694 {
2695 unsigned int level;
2696 pte_t *pte;
2697
2698 if (PageHighMem(page))
2699 return false;
2700
2701 pte = lookup_address((unsigned long)page_address(page), &level);
2702 return (pte_val(*pte) & _PAGE_PRESENT);
2703 }
2704
/*
 * Map @numpages pages starting at physical @pfn to @address in the page
 * table rooted at @pgd with protections @page_flags. Boot-time only:
 * the single __flush_tlb_all() below only flushes the current CPU, and
 * the WARN below fires if called after SMP bring-up.
 *
 * Returns 0 on success, -EINVAL when NX is unsupported, or the CPA
 * core's error code.
 */
int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
				   unsigned numpages, unsigned long page_flags)
{
	int retval = -EINVAL;

	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = pfn,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		/* Clear whichever of NX/RW/DIRTY the caller did not request. */
		.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)),
		.flags = CPA_NO_CHECK_ALIAS,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	if (!(__supported_pte_mask & _PAGE_NX))
		goto out;

	/* Strip the encryption bit unless the caller asked for it. */
	if (!(page_flags & _PAGE_ENC))
		cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);

	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);

	retval = __change_page_attr_set_clr(&cpa, 1);
	__flush_tlb_all();

out:
	return retval;
}
2736
2737 /*
2738 * __flush_tlb_all() flushes mappings only on current CPU and hence this
2739 * function shouldn't be used in an SMP environment. Presently, it's used only
2740 * during boot (way before smp_init()) by EFI subsystem and hence is ok.
2741 */
/*
 * Unmap @numpages pages at @address in the page table rooted at @pgd by
 * clearing the present, RW and DIRTY bits. Boot-time only (see the
 * comment above about __flush_tlb_all() and SMP).
 */
int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
				     unsigned long numpages)
{
	int retval;

	/*
	 * The typical sequence for unmapping is to find a pte through
	 * lookup_address_in_pgd() (ideally, it should never return NULL because
	 * the address is already mapped) and change its protections. As pfn is
	 * the *target* of a mapping, it's not useful while unmapping.
	 */
	struct cpa_data cpa = {
		.vaddr = &address,
		.pfn = 0,
		.pgd = pgd,
		.numpages = numpages,
		.mask_set = __pgprot(0),
		.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY),
		.flags = CPA_NO_CHECK_ALIAS,
	};

	WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");

	retval = __change_page_attr_set_clr(&cpa, 1);
	__flush_tlb_all();

	return retval;
}
2770
2771 /*
2772 * The testcases use internal knowledge of the implementation that shouldn't
2773 * be exposed to the rest of the kernel. Include these directly here.
2774 */
2775 #ifdef CONFIG_CPA_DEBUG
2776 #include "cpa-test.c"
2777 #endif
2778