// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
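
#if 0
/*
 * Illustrative sketch only, not part of this file: __vma_start_write() is
 * normally reached through vma_start_write() with the mmap lock held for
 * writing, before a VMA is modified, so that lock_vma_under_rcu() readers
 * back off or wait. example_modify_vma() is a hypothetical caller; the VMA
 * write lock is not dropped explicitly, it lapses when mmap_write_unlock()
 * advances mm->mm_lock_seq.
 */
static void example_modify_vma(struct vm_area_struct *vma)
{
	mmap_assert_write_locked(vma->vm_mm);

	vma_start_write(vma);	/* exclude per-VMA lock readers */
	/* ... modify fields that the page fault path may inspect ... */
}
#endif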

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(vma->vm_mm != mm ||
		     address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
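
#if 0
/*
 * Illustrative sketch only, not part of this file, expanding on the comment
 * in mmap_upgrade_trylock() above: an atomic read->write upgrade would try
 * to exchange a sole-reader count for the writer bit in one step.
 * example_mmap_upgrade_trylock() is hypothetical; RWSEM_READER_BIAS and
 * RWSEM_WRITER_LOCKED are private to kernel/locking/rwsem.c and the lockdep
 * bookkeeping is omitted, so this does not build as-is.
 */
static inline bool example_mmap_upgrade_trylock(struct mm_struct *mm)
{
	long old = RWSEM_READER_BIAS;	/* exactly one reader, no waiters */

	/* Succeeds only if we are the sole reader with nobody waiting. */
	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &old,
					       RWSEM_WRITER_LOCKED);
}
#endif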

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do all this for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */
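
#if 0
/*
 * Illustrative sketch only, not part of this file: how an architecture page
 * fault handler typically combines lock_vma_under_rcu() with the
 * lock_mm_and_find_vma() fallback. example_handle_user_fault() is
 * hypothetical and condenses the usual fault-flag setup, accounting and
 * retry handling.
 */
static vm_fault_t example_handle_user_fault(struct mm_struct *mm,
					    unsigned long address,
					    unsigned int flags,
					    struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/* Fast path: per-VMA read lock, no mmap_lock contention. */
	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	/* On RETRY/COMPLETED the VMA lock was already dropped for us. */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);
	if (!(fault & VM_FAULT_RETRY))
		return fault;

lock_mmap:
	/* Slow path: take the mmap lock and look the VMA up again. */
	vma = lock_mm_and_find_vma(mm, address, regs);
	if (!vma)
		return VM_FAULT_SIGSEGV;	/* arch code would call its bad-area handler */

	fault = handle_mm_fault(vma, address, flags, regs);
	/* On RETRY/COMPLETED the mmap lock was already dropped for us. */
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
	return fault;
}
#endif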