// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
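/*
 * Elevate vm_refcnt by VMA_LOCK_OFFSET to mark a writer and wait for any
 * existing readers to drain. Returns false if the vma is already detached
 * (vm_refcnt is zero), in which case there is nothing to wait for.
 */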
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

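/*
 * Drop the VMA_LOCK_OFFSET bias added by __vma_enter_locked(). *detached is
 * set to true if this drops vm_refcnt to zero, i.e. the vma is now detached.
 */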
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

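/*
 * Write-lock the vma: wait for existing readers to drain (if the vma is
 * attached), then publish the current mm write sequence in vm_lock_seq so
 * that new readers in vma_start_read() back off.
 */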
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);

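/*
 * Mark a write-locked vma as detached by dropping the vma's own reference to
 * vm_refcnt. If temporary readers still hold references, wait for them to
 * drain before returning.
 */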
void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt back. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Look up and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being
 * modified, the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

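/*
 * Fallback path for lock_next_vma(): take mmap_read_lock, redo the vma lookup
 * from the last search position and read-lock the result while the mmap lock
 * guarantees it cannot be detached.
 */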
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Look up the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

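/*
 * Find and read-lock the next vma after from_addr under RCU protection,
 * falling back to mmap_read_lock when the lockless walk cannot produce a
 * consistent, locked result. Must be called with the RCU read lock held.
 */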
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because the vma we find
		 * would have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * The vma can be ahead of the last search position, but we need to
	 * verify that it was not shrunk after we found it and that another
	 * vma has not been installed ahead of it. Otherwise we might observe
	 * a gap that should not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	vma_end_read(vma);
fallback:
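	/* mmap_read_lock_killable() can sleep, so drop the RCU read lock first. */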
	rcu_read_unlock();
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

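/*
 * Take mmap_read_lock for a page fault. Try the lock first; if that fails and
 * the fault came from kernel mode, only sleep on the lock when the faulting
 * instruction has an exception table fixup, i.e. is actually expected to
 * fault. Otherwise a kernel bug with mmap_lock already held could deadlock here.
 */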
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

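/*
 * Drop the read lock and take the mmap lock for writing, applying the same
 * exception-table check as get_mmap_lock_carefully() for kernel-mode faults.
 */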
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */