xref: /linux/arch/x86/boot/startup/map_kernel.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
// SPDX-License-Identifier: GPL-2.0

#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/pgtable.h>

#include <asm/init.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sev.h>

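/*
 * Pool of early page tables and the index of the next free slot, defined
 * outside this file; entries are handed out below while constructing the
 * early 1:1 mapping.
 */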
extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
extern unsigned int next_early_pgt;

static inline bool check_la57_support(void)
{
	/*
	 * 5-level paging is detected and enabled at the kernel decompression
	 * stage. Only check whether it has been enabled there.
	 */
	if (!(native_read_cr4() & X86_CR4_LA57))
		return false;

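	/* Record that 5-level paging is in use and adjust the paging geometry. */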
	__pgtable_l5_enabled	= 1;
	pgdir_shift		= 48;
	ptrs_per_p4d		= 512;

	return true;
}

static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
						    pmdval_t *pmd,
						    unsigned long p2v_offset)
{
	unsigned long paddr, paddr_end;
	int i;

	/* Encrypt the kernel and related (if SME is active) */
	sme_encrypt_kernel(bp);

	/*
	 * Clear the memory encryption mask from the .bss..decrypted section.
	 * The bss section will be memset to zero later in the initialization so
	 * there is no need to zero it after changing the memory encryption
	 * attribute.
	 */
	if (sme_get_me_mask()) {
		paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
		paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);

		for (; paddr < paddr_end; paddr += PMD_SIZE) {
			/*
			 * On SNP, transition the page to shared in the RMP table so that
			 * it is consistent with the page table attribute change.
			 *
			 * __start_bss_decrypted has a virtual address in the high range
			 * mapping (kernel .text). PVALIDATE, by way of
			 * early_snp_set_memory_shared(), requires a valid virtual
			 * address but the kernel is currently running off of the identity
			 * mapping so use the PA to get a *currently* valid virtual address.
			 */
			early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);

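			/*
			 * Translate back to the kernel virtual address and clear
			 * the encryption mask from the matching kernel PMD entry.
			 */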
			i = pmd_index(paddr - p2v_offset);
			pmd[i] -= sme_get_me_mask();
		}
	}

	/*
	 * Return the SME encryption mask (if SME is active) to be used as a
	 * modifier for the initial pgdir entry programmed into CR3.
	 */
	return sme_get_me_mask();
}

/*
 * This code is compiled using PIC codegen because it will execute from the
 * early 1:1 mapping of memory, which deviates from the mapping expected by the
 * linker. Due to this deviation, taking the address of a global variable will
 * produce an ambiguous result when using the plain & operator.  Instead,
 * rip_rel_ptr() must be used, which will return the RIP-relative address in
 * the 1:1 mapping of memory. Kernel virtual addresses can be determined by
 * subtracting p2v_offset from the RIP-relative address.
 */
unsigned long __head __startup_64(unsigned long p2v_offset,
				  struct boot_params *bp)
{
	pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
	unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
	unsigned long va_text, va_end;
	unsigned long pgtable_flags;
	unsigned long load_delta;
	pgdval_t *pgd;
	p4dval_t *p4d;
	pudval_t *pud;
	pmdval_t *pmd, pmd_entry;
	bool la57;
	int i;

	la57 = check_la57_support();

	/* Is the address too large? */
	if (physaddr >> MAX_PHYSMEM_BITS)
		for (;;);

	/*
	 * Compute the delta between the address I am compiled to run at
	 * and the address I am actually running at.
	 */
	phys_base = load_delta = __START_KERNEL_map + p2v_offset;

	/* Is the address not 2M aligned? */
	if (load_delta & ~PMD_MASK)
		for (;;);

	va_text = physaddr - p2v_offset;
	va_end  = (unsigned long)rip_rel_ptr(_end) - p2v_offset;

	/* Include the SME encryption mask in the fixup value */
	load_delta += sme_get_me_mask();

	/* Fixup the physical addresses in the page table */

	pgd = rip_rel_ptr(early_top_pgt);
	pgd[pgd_index(__START_KERNEL_map)] += load_delta;

	if (la57) {
		p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;

		pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
	}

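	/*
	 * The last two entries of level3_kernel_pgt carry the physical
	 * addresses of level2_kernel_pgt and level2_fixmap_pgt; adjust
	 * those, along with the fixmap PMD entries below.
	 */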
	level3_kernel_pgt[PTRS_PER_PUD - 2].pud += load_delta;
	level3_kernel_pgt[PTRS_PER_PUD - 1].pud += load_delta;

	for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
		level2_fixmap_pgt[i].pmd += load_delta;

	/*
	 * Set up the identity mapping for the switchover.  These
	 * entries should *NOT* have the global bit set!  This also
	 * creates a bunch of nonsense entries but that is fine --
	 * it avoids problems around wraparound.
	 */

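	/* Take the first two tables from the early pool for the PUD and PMD levels. */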
	pud = &early_pgts[0]->pmd;
	pmd = &early_pgts[1]->pmd;
	next_early_pgt = 2;

	pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();

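	/*
	 * Populate two adjacent entries at each level so the 1:1 mapping
	 * stays intact even when the kernel image straddles an entry
	 * boundary at that level.
	 */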
	if (la57) {
		p4d = &early_pgts[next_early_pgt++]->pmd;

		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
		pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;

		i = physaddr >> P4D_SHIFT;
		p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
		p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
	} else {
		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
		pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
		pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
	}

	i = physaddr >> PUD_SHIFT;
	pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
	pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;

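	/* Template PMD: 2M executable page, non-global, encrypted if SME is active. */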
	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
	pmd_entry += sme_get_me_mask();
	pmd_entry += physaddr;

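	/* Map the whole [_text, _end) range with 2M pages in the 1:1 mapping. */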
	for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
		int idx = i + (physaddr >> PMD_SHIFT);

		pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
	}

	/*
	 * Fixup the kernel text+data virtual addresses. Note that
	 * we might write invalid PMDs when the kernel is relocated;
	 * cleanup_highmap() fixes this up along with the mappings
	 * beyond _end.
	 *
	 * Only the region occupied by the kernel image has so far
	 * been checked against the table of usable memory regions
	 * provided by the firmware, so invalidate pages outside that
	 * region. A page table entry that maps to a reserved area of
	 * memory would allow processor speculation into that area,
	 * and on some hardware (particularly the UV platform) even
	 * speculative access to some reserved areas is caught as an
	 * error, causing the BIOS to halt the system.
	 */

	pmd = rip_rel_ptr(level2_kernel_pgt);

	/* invalidate pages before the kernel image */
	for (i = 0; i < pmd_index(va_text); i++)
		pmd[i] &= ~_PAGE_PRESENT;

	/* fixup pages that are part of the kernel image */
	for (; i <= pmd_index(va_end); i++)
		if (pmd[i] & _PAGE_PRESENT)
			pmd[i] += load_delta;

	/* invalidate pages after the kernel image */
	for (; i < PTRS_PER_PMD; i++)
		pmd[i] &= ~_PAGE_PRESENT;

	return sme_postprocess_startup(bp, pmd, p2v_offset);
}