Lines Matching +full:d +full:- +full:tlb +full:- +full:sets
1 // SPDX-License-Identifier: GPL-2.0-only
14 #include <asm/nospec-branch.h>
31 * TLB flushing, formerly SMP-only
57 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
64 * ASID - [0, TLB_NR_DYN_ASIDS-1]
67 * kPCID - [1, TLB_NR_DYN_ASIDS]
71 * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
91 #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
94 * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account
95 * for them being zero-based. Another -1 is because PCID 0 is reserved for
96 * use by non-PCID-aware users.
98 #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
121 * The dynamically-assigned ASIDs that get passed in are small in kern_pcid()
125 * If PCID is on, ASID-aware code paths put the ASID+1 into the in kern_pcid()
127 * situation in which PCID-unaware code saves CR3, loads some other in kern_pcid()
129 * the TLB for ASID 0 if the saved ASID was nonzero. It also means in kern_pcid()
130 * that any bugs involving loading a PCID-enabled CR3 with in kern_pcid()
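Taken together, the numbering scheme and the kern_pcid() comments above reduce to simple arithmetic: the kernel PCID is the dynamic ASID plus one (so PCID 0 stays reserved for non-PCID-aware code), and the user PCID is the same value with an extra high bit set, which is where the 2048 offset in the uPCID range comes from. A minimal sketch, assuming the user bit is bit 11 as implied by that offset (the in-tree kern_pcid()/user_pcid() helpers also carry debug warnings and CONFIG_PAGE_TABLE_ISOLATION checks):

    /* Sketch only: kernel PCID is the dynamic ASID shifted up by one. */
    static inline u16 sketch_kern_pcid(u16 asid)
    {
            return asid + 1;
    }

    /* Sketch only: user PCID is the kernel PCID with bit 11 set (the +2048). */
    static inline u16 sketch_user_pcid(u16 asid)
    {
            return sketch_kern_pcid(asid) | (1u << 11);
    }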
171 * We get here when we do something requiring a TLB invalidation
174 * forces a TLB flush when the context is loaded.
221 next->context.ctx_id) in choose_new_asid()
234 *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; in choose_new_asid()
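The two choose_new_asid() fragments above are the heart of the per-CPU ASID cache: first look for a slot that already belongs to the incoming mm (by comparing ctx_id), and only fall back to handing out the next slot round-robin, which always costs a flush. A simplified sketch of that policy, with the per-CPU cpu_tlbstate accessors replaced by plain arguments for illustration:

    struct sketch_asid_ctx {
            u64 ctx_id;     /* which mm currently owns this ASID slot */
            u64 tlb_gen;    /* how far this slot's TLB contents have caught up */
    };

    static void sketch_choose_new_asid(struct sketch_asid_ctx *ctxs, u16 *next_asid,
                                       u64 next_ctx_id, u64 next_tlb_gen,
                                       u16 *new_asid, bool *need_flush)
    {
            u16 asid;

            /* Reuse a slot we already own; flush only if its contents are stale. */
            for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                    if (ctxs[asid].ctx_id != next_ctx_id)
                            continue;
                    *new_asid = asid;
                    *need_flush = ctxs[asid].tlb_gen < next_tlb_gen;
                    return;
            }

            /* No slot owned: take the next one round-robin and force a flush. */
            *new_asid = (*next_asid)++ % TLB_NR_DYN_ASIDS;
            *need_flush = true;
    }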
281 * that load_cr3() is serializing and orders TLB in load_new_mm_cr3()
292 * It's plausible that we're in lazy TLB mode while our mm is init_mm. in leave_mm()
293 * If so, our callers still expect us to flush the TLB, but there in leave_mm()
294 * aren't any user TLB entries in init_mm to worry about. in leave_mm()
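The leave_mm() fragments above boil down to: if this CPU is already lazily sitting on init_mm there are no user entries to flush, otherwise switch over to init_mm so the old mm's ASID can safely go stale. A hedged sketch of that shape (the real function also warns if the CPU was not actually lazy):

    static void sketch_leave_mm(void)
    {
            struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

            /* Lazy TLB on init_mm: nothing in the TLB belongs to user space. */
            if (loaded_mm == &init_mm)
                    return;

            /* Switch this CPU to init_mm; the previous mm's ASID is abandoned. */
            switch_mm(NULL, &init_mm, NULL);
    }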
321 unsigned long next_tif = task_thread_info(next)->flags; in mm_mangle_tif_spec_ib()
324 return (unsigned long)next->mm | ibpb; in mm_mangle_tif_spec_ib()
329 if (!next || !next->mm) in cond_ibpb()
335 * same process. Using the mm pointer instead of mm->context.ctx_id in cond_ibpb()
360 * - the same user space task is scheduled out and later in cond_ibpb()
364 * - a user space task belonging to the same process is in cond_ibpb()
367 * - a user space task belonging to the same process is in cond_ibpb()
395 if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { in cond_ibpb()
397 this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); in cond_ibpb()
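The cond_ibpb() fragments above implement the "only pay for the barrier when it buys something" rule: kernel threads are skipped, and the barrier is issued only when the incoming user mm differs from the one that last ran user space on this CPU. A simplified sketch of that check, with the per-CPU last_user_mm tracking passed in explicitly (the in-tree code additionally folds the TIF_SPEC_IB bit into the pointer via mm_mangle_tif_spec_ib()):

    static void sketch_cond_ibpb(struct mm_struct *next_mm,
                                 struct mm_struct **last_user_mm)
    {
            /* Kernel thread: no user prediction state worth protecting. */
            if (!next_mm)
                    return;

            /* Same process coming straight back: skip the expensive barrier. */
            if (*last_user_mm == next_mm)
                    return;

            indirect_branch_prediction_barrier();
            *last_user_mm = next_mm;
    }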
407 atomic_read(&mm->context.perf_rdpmc_allowed))) in cr4_update_pce_mm()
435 * from lazy TLB mode to normal mode if active_mm isn't changing. in switch_mm_irqs_off()
456 if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) { in switch_mm_irqs_off()
458 * If we were to BUG here, we'd be very likely to kill in switch_mm_irqs_off()
476 * core serialization before returning to user-space, after in switch_mm_irqs_off()
477 * storing to rq->curr, when changing mm. This is because in switch_mm_irqs_off()
488 next->context.ctx_id); in switch_mm_irqs_off()
491 * Even in lazy TLB mode, the CPU should stay set in the in switch_mm_irqs_off()
492 * mm_cpumask. The TLB shootdown code can figure out from in switch_mm_irqs_off()
500 * If the CPU is not in lazy TLB mode, we are just switching in switch_mm_irqs_off()
502 * process. No TLB flush required. in switch_mm_irqs_off()
509 * If the TLB is up to date, just use it. in switch_mm_irqs_off()
511 * the TLB shootdown code. in switch_mm_irqs_off()
514 next_tlb_gen = atomic64_read(&next->context.tlb_gen); in switch_mm_irqs_off()
520 * TLB contents went out of date while we were in lazy in switch_mm_irqs_off()
521 * mode. Fall through to the TLB switching code below. in switch_mm_irqs_off()
529 * one process from doing Spectre-v2 attacks on another. in switch_mm_irqs_off()
535 * Skip kernel threads; we never send init_mm TLB flushing IPIs, in switch_mm_irqs_off()
549 next_tlb_gen = atomic64_read(&next->context.tlb_gen); in switch_mm_irqs_off()
559 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); in switch_mm_irqs_off()
561 load_new_mm_cr3(next->pgd, new_asid, true); in switch_mm_irqs_off()
566 load_new_mm_cr3(next->pgd, new_asid, false); in switch_mm_irqs_off()
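The two load_new_mm_cr3() calls that end switch_mm_irqs_off() above differ only in the final argument: whether the CR3 write must flush the new ASID's cached translations or may keep them. With PCID, the latter is requested by setting bit 63 of CR3, the no-flush bit. A hedged sketch of how the CR3 value is plausibly assembled (the in-tree build_cr3()/build_cr3_noflush() helpers also handle SME and the non-PCID case):

    #define SKETCH_CR3_NOFLUSH (1UL << 63)  /* "keep this PCID's TLB entries" */

    static unsigned long sketch_build_cr3(pgd_t *pgd, u16 asid, bool need_flush)
    {
            /* Physical address of the page tables plus the kernel PCID (ASID + 1). */
            unsigned long cr3 = __pa(pgd) | (unsigned long)(asid + 1);

            /* If the cached entries for this ASID are still valid, ask the
             * CR3 write to preserve them instead of flushing. */
            if (!need_flush)
                    cr3 |= SKETCH_CR3_NOFLUSH;

            return cr3;
    }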
590 * lazy tricks to try to minimize TLB flushes.
608 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
612 * - The TLB contains junk in slots corresponding to inactive ASIDs.
614 * - The CPU went so far out to lunch that it may have missed a TLB
625 WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); in initialize_tlbstate_and_flush()
635 /* Force ASID 0 and force a TLB flush. */ in initialize_tlbstate_and_flush()
636 write_cr3(build_cr3(mm->pgd, 0)); in initialize_tlbstate_and_flush()
642 this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); in initialize_tlbstate_and_flush()
651 * TLB fills that happen after we flush the TLB are ordered after we
662 * - mm_tlb_gen: the latest generation. in flush_tlb_func_common()
663 * - local_tlb_gen: the generation that this CPU has already caught in flush_tlb_func_common()
665 * - f->new_tlb_gen: the generation that the requester of the flush in flush_tlb_func_common()
670 u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); in flush_tlb_func_common()
680 loaded_mm->context.ctx_id); in flush_tlb_func_common()
685 * paging-structure cache to avoid speculatively reading in flush_tlb_func_common()
686 * garbage into our TLB. Since switching to init_mm is barely in flush_tlb_func_common()
690 * IPIs to lazy TLB mode CPUs. in flush_tlb_func_common()
699 * happen if two concurrent flushes happen -- the first flush to in flush_tlb_func_common()
708 WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); in flush_tlb_func_common()
711 * If we get to this point, we know that our TLB is out of date. in flush_tlb_func_common()
713 * possible that f->new_tlb_gen <= local_tlb_gen), but we're in flush_tlb_func_common()
722 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that in flush_tlb_func_common()
725 * f->new_tlb_gen == 3, then we know that the flush needed to bring in flush_tlb_func_common()
736 * 3, we'd break the invariant: we'd update local_tlb_gen above in flush_tlb_func_common()
739 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. in flush_tlb_func_common()
740 * Partial TLB flushes are not all that much cheaper than full TLB in flush_tlb_func_common()
742 * to do a partial flush if that won't bring our TLB fully up to in flush_tlb_func_common()
747 if (f->end != TLB_FLUSH_ALL && in flush_tlb_func_common()
748 f->new_tlb_gen == local_tlb_gen + 1 && in flush_tlb_func_common()
749 f->new_tlb_gen == mm_tlb_gen) { in flush_tlb_func_common()
751 unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; in flush_tlb_func_common()
752 unsigned long addr = f->start; in flush_tlb_func_common()
754 while (addr < f->end) { in flush_tlb_func_common()
756 addr += 1UL << f->stride_shift; in flush_tlb_func_common()
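The three generations compared above are produced on two sides: whoever requests a flush first bumps the mm-wide generation (that value travels as f->new_tlb_gen), and each responding CPU records how far it has caught up once it has flushed. A hedged sketch of those two bookkeeping steps, matching the atomic64 tlb_gen fields referenced in the fragments:

    /* Requester side: every flush request gets a new, strictly increasing generation. */
    static u64 sketch_bump_tlb_gen(struct mm_struct *mm)
    {
            return atomic64_inc_return(&mm->context.tlb_gen);
    }

    /* Responder side: after flushing, remember how far this ASID has caught up. */
    static void sketch_mark_caught_up(u16 asid, u64 flushed_to_gen)
    {
            this_cpu_write(cpu_tlbstate.ctxs[asid].tlb_gen, flushed_to_gen);
    }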
786 if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) in flush_tlb_func_remote()
802 if (info->end == TLB_FLUSH_ALL) in native_flush_tlb_others()
806 (info->end - info->start) >> PAGE_SHIFT); in native_flush_tlb_others()
810 * CPUs in lazy TLB mode. They will flush the CPU themselves in native_flush_tlb_others()
814 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping in native_flush_tlb_others()
818 if (info->freed_tables) in native_flush_tlb_others()
833 * See Documentation/x86/tlb.rst for details. We choose 33
859 * Ensure that the following code is non-reentrant and flush_tlb_info in get_flush_tlb_info()
860 * is not overwritten. This means no TLB flushing is initiated by in get_flush_tlb_info()
861 * interrupt handlers and machine-check exception handlers. in get_flush_tlb_info()
866 info->start = start; in get_flush_tlb_info()
867 info->end = end; in get_flush_tlb_info()
868 info->mm = mm; in get_flush_tlb_info()
869 info->stride_shift = stride_shift; in get_flush_tlb_info()
870 info->freed_tables = freed_tables; in get_flush_tlb_info()
871 info->new_tlb_gen = new_tlb_gen; in get_flush_tlb_info()
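The assignments above fill the flush descriptor that is then handed to the local and remote flush paths. A sketch of the layout these fields imply (field order and exact types are an assumption; the real struct flush_tlb_info lives in arch/x86/include/asm/tlbflush.h):

    struct sketch_flush_tlb_info {
            struct mm_struct *mm;           /* NULL for kernel-range flushes */
            unsigned long start;            /* first address to invalidate */
            unsigned long end;              /* end of range, or TLB_FLUSH_ALL */
            u64 new_tlb_gen;                /* generation this flush brings CPUs up to */
            unsigned int stride_shift;      /* page-size shift of the flushed mapping */
            bool freed_tables;              /* page tables freed: lazy CPUs must not walk them */
    };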
897 ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { in flush_tlb_mm_range()
941 for (addr = f->start; addr < f->end; addr += PAGE_SIZE) in do_kernel_range_flush()
949 (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { in flush_tlb_kernel_range()
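Both range-flush paths above apply the same heuristic: invalidate page by page only while the range stays at or below tlb_single_page_flush_ceiling pages (33 by default, per the comment further up), and fall back to a full flush beyond that. A hedged sketch of the decision:

    /* Sketch: choose between per-page INVLPG and flushing everything. */
    static bool sketch_flush_whole_tlb(unsigned long start, unsigned long end,
                                       unsigned int stride_shift)
    {
            if (end == TLB_FLUSH_ALL)
                    return true;

            /* Beyond the ceiling, one full flush is cheaper than many INVLPGs. */
            return ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling;
    }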
973 unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, in __get_current_cr3_fast()
997 * If PTI is on, then the kernel is mapped with non-global PTEs, and in flush_tlb_one_kernel()
1062 * Read-modify-write to CR4 - protect it from preemption and in native_flush_tlb_global()
1091 /* If current->mm == NULL then the read_cr3() "borrows" an mm */ in native_flush_tlb_local()
1115 * !PGE -> !PCID (setup_pcid()), thus every flush is total. in __flush_tlb_all()
1123 * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
1138 if (cpumask_test_cpu(cpu, &batch->cpumask)) { in arch_tlbbatch_flush()
1145 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) in arch_tlbbatch_flush()
1146 flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); in arch_tlbbatch_flush()
1148 cpumask_clear(&batch->cpumask); in arch_tlbbatch_flush()
1163 struct mm_struct *current_mm = current->mm; in nmi_uaccess_okay()
1169 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, in nmi_uaccess_okay()
1175 * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. in nmi_uaccess_okay()
1180 VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); in nmi_uaccess_okay()
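The nmi_uaccess_okay() fragments above capture the rule that an NMI may only touch user memory when CR3 actually points at current->mm's page tables; comparing loaded_mm is the cheap stand-in for the slow CR3 read, and LOADED_MM_SWITCHING makes the mid-switch window fail the check. A hedged sketch of that logic without the debug warnings:

    static bool sketch_nmi_uaccess_okay(void)
    {
            struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);

            /* Mid-switch (LOADED_MM_SWITCHING) or a different mm loaded:
             * CR3 may not match current->mm, so user accesses are unsafe. */
            if (loaded_mm != current->mm)
                    return false;

            return true;
    }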
1202 len = min(count, sizeof(buf) - 1); in tlbflush_write_file()
1204 return -EFAULT; in tlbflush_write_file()
1208 return -EINVAL; in tlbflush_write_file()
1211 return -EINVAL; in tlbflush_write_file()
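The final fragments come from the debugfs write handler that lets the flush ceiling be tuned at run time. A hedged sketch of how the pieces shown (the bounded copy, the -EFAULT and -EINVAL returns) plausibly fit together; the parsed value is assumed to land in tlb_single_page_flush_ceiling:

    static ssize_t sketch_tlbflush_write_file(struct file *file,
                    const char __user *user_buf, size_t count, loff_t *ppos)
    {
            char buf[32];
            ssize_t len;
            int ceiling;

            /* Copy at most sizeof(buf) - 1 bytes so a NUL terminator fits. */
            len = min(count, sizeof(buf) - 1);
            if (copy_from_user(buf, user_buf, len))
                    return -EFAULT;

            buf[len] = '\0';
            if (kstrtoint(buf, 0, &ceiling))
                    return -EINVAL;

            /* A negative ceiling makes no sense. */
            if (ceiling < 0)
                    return -EINVAL;

            tlb_single_page_flush_ceiling = ceiling;
            return count;
    }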