xref: /linux/kernel/locking/rwsem.c (revision 7393febcb1b2082c0484952729cbebfe4dc508d5)
1 // SPDX-License-Identifier: GPL-2.0
2 /* kernel/rwsem.c: R/W semaphores, public implementation
3  *
4  * Written by David Howells (dhowells@redhat.com).
5  * Derived from asm-i386/semaphore.h
6  *
7  * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8  * and Michel Lespinasse <walken@google.com>
9  *
10  * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11  * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12  *
13  * Rwsem count bit fields re-definition and rwsem rearchitecture by
14  * Waiman Long <longman@redhat.com> and
15  * Peter Zijlstra <peterz@infradead.org>.
16  */
17 
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/sched/rt.h>
22 #include <linux/sched/task.h>
23 #include <linux/sched/debug.h>
24 #include <linux/sched/wake_q.h>
25 #include <linux/sched/signal.h>
26 #include <linux/sched/clock.h>
27 #include <linux/export.h>
28 #include <linux/rwsem.h>
29 #include <linux/atomic.h>
30 #include <linux/hung_task.h>
31 #include <trace/events/lock.h>
32 
33 #ifndef CONFIG_PREEMPT_RT
34 #include "lock_events.h"
35 
36 /*
37  * The least significant 2 bits of the owner value have the following
38  * meanings when set.
39  *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
40  *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
41  *
42  * When the rwsem is reader-owned and a spinning writer has timed out,
43  * the nonspinnable bit will be set to disable optimistic spinning.
44  *
45  * When a writer acquires a rwsem, it puts its task_struct pointer
46  * into the owner field. It is cleared after an unlock.
47  *
48  * When a reader acquires a rwsem, it will also put its task_struct
49  * pointer into the owner field with the RWSEM_READER_OWNED bit set.
50  * On unlock, the owner field will largely be left untouched. So
51  * for a free or reader-owned rwsem, the owner value may contain
52  * information about the last reader that acquired the rwsem.
53  *
54  * That information may be helpful in debugging cases where the system
55  * seems to hang on a reader-owned rwsem, especially if only one reader
56  * is involved. Ideally we would like to track all the readers that own
57  * a rwsem, but the overhead is simply too big.
58  *
59  * Fast-path reader optimistic lock stealing is supported when the rwsem
60  * was previously owned by a writer and the following conditions are met:
61  *  - the rwsem is not currently writer-owned
62  *  - the handoff bit isn't set.
63  */
64 #define RWSEM_READER_OWNED	(1UL << 0)
65 #define RWSEM_NONSPINNABLE	(1UL << 1)
66 #define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
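
/*
 * Illustrative sketch (not part of the original source): how the owner word
 * packs a task_struct pointer together with the two flag bits above. Since
 * task_struct is at least 4-byte aligned, the low two bits of the pointer
 * are always zero and are free to carry the flags.
 *
 *	unsigned long owner = (unsigned long)current | RWSEM_READER_OWNED;
 *	struct task_struct *tsk =
 *		(struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
 *	bool reader_owned = owner & RWSEM_READER_OWNED;
 */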
67 
68 #ifdef CONFIG_DEBUG_RWSEMS
69 # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
70 	if (!debug_locks_silent &&				\
71 	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
72 		#c, atomic_long_read(&(sem)->count),		\
73 		(unsigned long) sem->magic,			\
74 		atomic_long_read(&(sem)->owner), (long)current,	\
75 		rwsem_is_contended(sem) ? "" : "not "))		\
76 			debug_locks_off();			\
77 	} while (0)
78 #else
79 # define DEBUG_RWSEMS_WARN_ON(c, sem)
80 #endif
81 
82 /*
83  * On 64-bit architectures, the bit definitions of the count are:
84  *
85  * Bit  0    - writer locked bit
86  * Bit  1    - waiters present bit
87  * Bit  2    - lock handoff bit
88  * Bits 3-7  - reserved
89  * Bits 8-62 - 55-bit reader count
90  * Bit  63   - read fail bit
91  *
92  * On 32-bit architectures, the bit definitions of the count are:
93  *
94  * Bit  0    - writer locked bit
95  * Bit  1    - waiters present bit
96  * Bit  2    - lock handoff bit
97  * Bits 3-7  - reserved
98  * Bits 8-30 - 23-bit reader count
99  * Bit  31   - read fail bit
100  *
101  * It is not likely that the most significant bit (read fail bit) will ever
102  * be set. This guard bit is still checked anyway in the down_read() fastpath
103  * just in case we need to use up more of the reader bits for other purposes
104  * in the future.
105  *
106  * atomic_long_fetch_add() is used to obtain the reader lock, whereas
107  * atomic_long_cmpxchg() is used to obtain the writer lock.
108  *
109  * There are three places where the lock handoff bit may be set or cleared.
110  * 1) rwsem_mark_wake() for readers		-- set, clear
111  * 2) rwsem_try_write_lock() for writers	-- set, clear
112  * 3) rwsem_del_waiter()			-- clear
113  *
114  * For all the above cases, wait_lock will be held. A writer must also
115  * be the first one in the wait_list to be eligible for setting the handoff
116  * bit. So concurrent setting/clearing of handoff bit is not possible.
117  */
118 #define RWSEM_WRITER_LOCKED	(1UL << 0)
119 #define RWSEM_FLAG_WAITERS	(1UL << 1)
120 #define RWSEM_FLAG_HANDOFF	(1UL << 2)
121 #define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
122 
123 #define RWSEM_READER_SHIFT	8
124 #define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
125 #define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
126 #define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
127 #define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
128 #define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
129 				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
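
/*
 * Worked example (illustrative only): on a 64-bit system a count value of
 * 0x0000000000000302 decodes as
 *	bit 0  (RWSEM_WRITER_LOCKED) = 0 -> not write-locked
 *	bit 1  (RWSEM_FLAG_WAITERS)  = 1 -> waiters are queued
 *	bits 8-62 (reader count)     = 3 -> three readers hold the lock
 * i.e. three readers own the lock while at least one waiter is sleeping.
 */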
130 
131 /*
132  * All writes to owner are protected by WRITE_ONCE() to make sure that
133  * store tearing can't happen as optimistic spinners may read and use
134  * the owner value concurrently without lock. Read from owner, however,
135  * may not need READ_ONCE() as long as the pointer value is only used
136  * for comparison and isn't being dereferenced.
137  *
138  * Both rwsem_{set,clear}_owner() functions should be in the same
139  * preempt disable section as the atomic op that changes sem->count.
140  */
141 static inline void rwsem_set_owner(struct rw_semaphore *sem)
142 {
143 	lockdep_assert_preemption_disabled();
144 	atomic_long_set(&sem->owner, (long)current);
145 }
146 
147 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
148 {
149 	lockdep_assert_preemption_disabled();
150 	atomic_long_set(&sem->owner, 0);
151 }
152 
153 /*
154  * Test the flags in the owner field.
155  */
156 static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
157 {
158 	return atomic_long_read(&sem->owner) & flags;
159 }
160 
161 /*
162  * The task_struct pointer of the last owning reader will be left in
163  * the owner field.
164  *
165  * Note that the owner value just indicates the task has owned the rwsem
166  * previously; it may not be the real owner or one of the real owners
167  * anymore when that field is examined, so take it with a grain of salt.
168  *
169  * The reader non-spinnable bit is preserved.
170  */
171 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
172 					    struct task_struct *owner)
173 {
174 	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
175 		(atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
176 
177 	atomic_long_set(&sem->owner, val);
178 }
179 
180 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
181 {
182 	__rwsem_set_reader_owned(sem, current);
183 }
184 
185 #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
186 /*
187  * Return just the real task structure pointer of the owner
188  */
189 struct task_struct *rwsem_owner(struct rw_semaphore *sem)
190 {
191 	return (struct task_struct *)
192 		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
193 }
194 
195 /*
196  * Return true if the rwsem is owned by a reader.
197  */
198 bool is_rwsem_reader_owned(struct rw_semaphore *sem)
199 {
200 	/*
201 	 * Check the count to see if it is write-locked.
202 	 */
203 	long count = atomic_long_read(&sem->count);
204 
205 	if (count & RWSEM_WRITER_MASK)
206 		return false;
207 	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
208 }
209 
210 /*
211  * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
212  * it will make sure that the owner field of a reader-owned rwsem either
213  * points to one of the real reader-owners or gets cleared. The only exception is
214  * when the unlock is done by up_read_non_owner().
215  */
216 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
217 {
218 	unsigned long val = atomic_long_read(&sem->owner);
219 
220 	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
221 		if (atomic_long_try_cmpxchg(&sem->owner, &val,
222 					    val & RWSEM_OWNER_FLAGS_MASK))
223 			return;
224 	}
225 }
226 #else
227 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
228 {
229 }
230 #endif
231 
232 /*
233  * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
234  * remains set. Otherwise, the operation is aborted.
235  */
236 static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
237 {
238 	unsigned long owner = atomic_long_read(&sem->owner);
239 
240 	do {
241 		if (!(owner & RWSEM_READER_OWNED))
242 			break;
243 		if (owner & RWSEM_NONSPINNABLE)
244 			break;
245 	} while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
246 					  owner | RWSEM_NONSPINNABLE));
247 }
248 
249 static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
250 {
251 	*cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
252 
253 	if (WARN_ON_ONCE(*cntp < 0))
254 		rwsem_set_nonspinnable(sem);
255 
256 	if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
257 		rwsem_set_reader_owned(sem);
258 		return true;
259 	}
260 
261 	return false;
262 }
263 
264 static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
265 {
266 	long tmp = RWSEM_UNLOCKED_VALUE;
267 
268 	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
269 		rwsem_set_owner(sem);
270 		return true;
271 	}
272 
273 	return false;
274 }
275 
276 /*
277  * Return the real task structure pointer of the owner and the embedded
278  * flags in the owner. pflags must be non-NULL.
279  */
280 static inline struct task_struct *
281 rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
282 {
283 	unsigned long owner = atomic_long_read(&sem->owner);
284 
285 	*pflags = owner & RWSEM_OWNER_FLAGS_MASK;
286 	return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
287 }
288 
289 /*
290  * Guide to the rw_semaphore's count field.
291  *
292  * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
293  * by a writer.
294  *
295  * The lock is owned by readers when
296  * (1) the RWSEM_WRITER_LOCKED isn't set in count,
297  * (2) some of the reader bits are set in count, and
298  * (3) the owner field has the RWSEM_READER_OWNED bit set.
299  *
300  * Having some reader bits set is not enough to guarantee a reader-owned
301  * lock as the readers may be in the process of backing out from the count
302  * and a writer has just released the lock. So another writer may steal
303  * the lock immediately after that.
304  */
305 
306 /*
307  * Initialize an rwsem:
308  */
309 void __init_rwsem(struct rw_semaphore *sem, const char *name,
310 		  struct lock_class_key *key)
311 {
312 #ifdef CONFIG_DEBUG_LOCK_ALLOC
313 	/*
314 	 * Make sure we are not reinitializing a held semaphore:
315 	 */
316 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
317 	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
318 #endif
319 #ifdef CONFIG_DEBUG_RWSEMS
320 	sem->magic = sem;
321 #endif
322 	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
323 	atomic_long_set(&sem->owner, 0L);
324 	scoped_guard (raw_spinlock_init, &sem->wait_lock) {
325 		sem->first_waiter = NULL;
326 	}
327 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
328 	osq_lock_init(&sem->osq);
329 #endif
330 }
331 EXPORT_SYMBOL(__init_rwsem);
332 
333 enum rwsem_waiter_type {
334 	RWSEM_WAITING_FOR_WRITE,
335 	RWSEM_WAITING_FOR_READ
336 };
337 
338 struct rwsem_waiter {
339 	struct list_head list;
340 	struct task_struct *task;
341 	enum rwsem_waiter_type type;
342 	unsigned long timeout;
343 	bool handoff_set;
344 };
345 
346 enum rwsem_wake_type {
347 	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
348 	RWSEM_WAKE_READERS,	/* Wake readers only */
349 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
350 };
351 
352 /*
353  * The typical HZ value is either 250 or 1000. So set the minimum waiting
354  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
355  * queue before initiating the handoff protocol.
356  */
357 #define RWSEM_WAIT_TIMEOUT	DIV_ROUND_UP(HZ, 250)
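
/*
 * Worked example (illustrative only): DIV_ROUND_UP(HZ, 250) evaluates to
 *	HZ = 100  -> 1 jiffy   (10 ms)
 *	HZ = 250  -> 1 jiffy   ( 4 ms)
 *	HZ = 1000 -> 4 jiffies ( 4 ms)
 * so a waiter is given at least ~4ms (or one jiffy on coarse HZ
 * configurations) before it may request a handoff.
 */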
358 
359 /*
360  * Magic number to batch-wakeup waiting readers, even when writers are
361  * also present in the queue. This both limits the amount of work the
362  * waking thread must do and also prevents any potential counter overflow,
363  * however unlikely.
364  */
365 #define MAX_READERS_WAKEUP	0x100
366 
367 static inline
368 bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
369 	__must_hold(&sem->wait_lock)
370 {
371 	if (list_empty(&waiter->list)) {
372 		sem->first_waiter = NULL;
373 		return false;
374 	}
375 
376 	if (sem->first_waiter == waiter) {
377 		sem->first_waiter = list_first_entry(&waiter->list,
378 						     struct rwsem_waiter, list);
379 	}
380 	list_del(&waiter->list);
381 
382 	return true;
383 }
384 
385 /*
386  * Remove a waiter from the wait_list and clear flags.
387  *
388  * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
389  * this function. Modify with care.
390  *
391  * Return: true if wait_list isn't empty and false otherwise
392  */
393 static inline bool
394 rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
395 {
396 	lockdep_assert_held(&sem->wait_lock);
397 	if (__rwsem_del_waiter(sem, waiter))
398 		return true;
399 	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
400 	return false;
401 }
402 
403 static inline
404 struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem,
405 				 const struct rwsem_waiter *waiter)
406 	__must_hold(&sem->wait_lock)
407 {
408 	struct rwsem_waiter *next = list_first_entry(&waiter->list,
409 						     struct rwsem_waiter, list);
410 	if (next == sem->first_waiter)
411 		return NULL;
412 	return next;
413 }
414 
415 /*
416  * handle the lock release when there are processes blocked on it that can now run
417  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
418  *   have been set.
419  * - there must be someone on the queue
420  * - the wait_lock must be held by the caller
421  * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
422  *   to actually wakeup the blocked task(s) and drop the reference count,
423  *   preferably when the wait_lock is released
424  * - woken process blocks are discarded from the list after having task zeroed
425  * - writers are only marked woken if downgrading is false
426  *
427  * Implies rwsem_del_waiter() for all woken readers.
428  */
429 static void rwsem_mark_wake(struct rw_semaphore *sem,
430 			    enum rwsem_wake_type wake_type,
431 			    struct wake_q_head *wake_q)
432 {
433 	struct rwsem_waiter *waiter, *next;
434 	long oldcount, woken = 0, adjustment = 0;
435 	struct list_head wlist;
436 
437 	lockdep_assert_held(&sem->wait_lock);
438 
439 	/*
440 	 * Take a peek at the queue head waiter such that we can determine
441 	 * the wakeup(s) to perform.
442 	 */
443 	waiter = sem->first_waiter;
444 
445 	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
446 		if (wake_type == RWSEM_WAKE_ANY) {
447 			/*
448 			 * Mark writer at the front of the queue for wakeup.
449 			 * Until the task is actually awoken later by
450 			 * the caller, other writers are able to steal it.
451 			 * Readers, on the other hand, will block as they
452 			 * will notice the queued writer.
453 			 */
454 			wake_q_add(wake_q, waiter->task);
455 			lockevent_inc(rwsem_wake_writer);
456 		}
457 
458 		return;
459 	}
460 
461 	/*
462 	 * No reader wakeup if there are too many of them already.
463 	 */
464 	if (unlikely(atomic_long_read(&sem->count) < 0))
465 		return;
466 
467 	/*
468 	 * Writers might steal the lock before we grant it to the next reader.
469 	 * We prefer to do the first reader grant before counting readers
470 	 * so we can bail out early if a writer stole the lock.
471 	 */
472 	if (wake_type != RWSEM_WAKE_READ_OWNED) {
473 		struct task_struct *owner;
474 
475 		adjustment = RWSEM_READER_BIAS;
476 		oldcount = atomic_long_fetch_add(adjustment, &sem->count);
477 		if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
478 			/*
479 			 * When we've been waiting "too" long (for writers
480 			 * to give up the lock), request a HANDOFF to
481 			 * force the issue.
482 			 */
483 			if (time_after(jiffies, waiter->timeout)) {
484 				if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
485 					adjustment -= RWSEM_FLAG_HANDOFF;
486 					lockevent_inc(rwsem_rlock_handoff);
487 				}
488 				waiter->handoff_set = true;
489 			}
490 
491 			atomic_long_add(-adjustment, &sem->count);
492 			return;
493 		}
494 		/*
495 		 * Set it to reader-owned to give spinners an early
496 		 * indication that readers now have the lock.
497 		 * The reader nonspinnable bit seen at slowpath entry of
498 		 * the reader is copied over.
499 		 */
500 		owner = waiter->task;
501 		__rwsem_set_reader_owned(sem, owner);
502 	}
503 
504 	/*
505 	 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
506 	 * queue. We know that woken will be at least 1 as we accounted
507 	 * for above. Note we increment the 'active part' of the count by the
508 	 * number of readers before waking any processes up.
509 	 *
510 	 * This is an adaptation of the phase-fair R/W locks where at the
511 	 * reader phase (first waiter is a reader), all readers are eligible
512 	 * to acquire the lock at the same time irrespective of their order
513 	 * in the queue. The writers acquire the lock according to their
514 	 * order in the queue.
515 	 *
516 	 * We have to do the wakeup in 2 passes to prevent the possibility that
517 	 * the reader count may be decremented before it is incremented. This
518 	 * is because the to-be-woken waiter may not have slept yet. So it
519 	 * may see waiter->task cleared, finish its critical section and
520 	 * do an unlock before the reader count is incremented.
521 	 *
522 	 * 1) Collect the read-waiters in a separate list, count them and
523 	 *    fully increment the reader count in rwsem.
524 	 * 2) For each waiter in the new list, clear waiter->task and
525 	 *    put them into wake_q to be woken up later.
526 	 */
527 	INIT_LIST_HEAD(&wlist);
528 	do {
529 		next = next_waiter(sem, waiter);
530 		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
531 			continue;
532 
533 		woken++;
534 		list_move_tail(&waiter->list, &wlist);
535 		if (sem->first_waiter == waiter)
536 			sem->first_waiter = next;
537 
538 		/*
539 		 * Limit # of readers that can be woken up per wakeup call.
540 		 */
541 		if (unlikely(woken >= MAX_READERS_WAKEUP))
542 			break;
543 	} while ((waiter = next) != NULL);
544 
545 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
546 	lockevent_cond_inc(rwsem_wake_reader, woken);
547 
548 	oldcount = atomic_long_read(&sem->count);
549 	if (!sem->first_waiter) {
550 		/*
551 		 * Combined with list_move_tail() above, this implies
552 		 * rwsem_del_waiter().
553 		 */
554 		adjustment -= RWSEM_FLAG_WAITERS;
555 		if (oldcount & RWSEM_FLAG_HANDOFF)
556 			adjustment -= RWSEM_FLAG_HANDOFF;
557 	} else if (woken) {
558 		/*
559 		 * When we've woken a reader, we no longer need to force
560 		 * writers to give up the lock and we can clear HANDOFF.
561 		 */
562 		if (oldcount & RWSEM_FLAG_HANDOFF)
563 			adjustment -= RWSEM_FLAG_HANDOFF;
564 	}
565 
566 	if (adjustment)
567 		atomic_long_add(adjustment, &sem->count);
568 
569 	/* 2nd pass */
570 	list_for_each_entry_safe(waiter, next, &wlist, list) {
571 		struct task_struct *tsk;
572 
573 		tsk = waiter->task;
574 		get_task_struct(tsk);
575 
576 		/*
577 		 * Ensure calling get_task_struct() before setting the reader
578 		 * waiter to nil such that rwsem_down_read_slowpath() cannot
579 		 * race with do_exit() by always holding a reference count
580 		 * to the task to wakeup.
581 		 */
582 		smp_store_release(&waiter->task, NULL);
583 		/*
584 		 * Ensure issuing the wakeup (either by us or someone else)
585 		 * after setting the reader waiter to nil.
586 		 */
587 		wake_q_add_safe(wake_q, tsk);
588 	}
589 }
590 
591 /*
592  * Remove a waiter and try to wake up other waiters in the wait queue
593  * This function is called from the out_nolock path of both the reader and
594  * writer slowpaths with wait_lock held. It releases the wait_lock and
595  * optionally wake up waiters before it returns.
596  */
597 static inline void
598 rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
599 		      struct wake_q_head *wake_q)
600 		      __releases(&sem->wait_lock)
601 {
602 	bool first = sem->first_waiter == waiter;
603 
604 	wake_q_init(wake_q);
605 
606 	/*
607 	 * If the wait_list isn't empty and the waiter to be deleted is
608 	 * the first waiter, we wake up the remaining waiters as they may
609 	 * be eligible to acquire or spin on the lock.
610 	 */
611 	if (rwsem_del_waiter(sem, waiter) && first)
612 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
613 	raw_spin_unlock_irq(&sem->wait_lock);
614 	if (!wake_q_empty(wake_q))
615 		wake_up_q(wake_q);
616 }
617 
618 /*
619  * This function must be called with the sem->wait_lock held to prevent
620  * race conditions between checking the rwsem wait list and setting the
621  * sem->count accordingly.
622  *
623  * Implies rwsem_del_waiter() on success.
624  */
625 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
626 					struct rwsem_waiter *waiter)
627 	__must_hold(&sem->wait_lock)
628 {
629 	struct rwsem_waiter *first = sem->first_waiter;
630 	long count, new;
631 
632 	lockdep_assert_held(&sem->wait_lock);
633 
634 	count = atomic_long_read(&sem->count);
635 	do {
636 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
637 
638 		if (has_handoff) {
639 			/*
640 			 * Honor handoff bit and yield only when the first
641 			 * waiter is the one that set it. Otherwise, we
642 			 * still try to acquire the rwsem.
643 			 */
644 			if (first->handoff_set && (waiter != first))
645 				return false;
646 		}
647 
648 		new = count;
649 
650 		if (count & RWSEM_LOCK_MASK) {
651 			/*
652 			 * A waiter (first or not) can set the handoff bit
653 			 * if it is an RT task or has waited in the wait queue
654 			 * for too long.
655 			 */
656 			if (has_handoff || (!rt_or_dl_task(waiter->task) &&
657 					    !time_after(jiffies, waiter->timeout)))
658 				return false;
659 
660 			new |= RWSEM_FLAG_HANDOFF;
661 		} else {
662 			new |= RWSEM_WRITER_LOCKED;
663 			new &= ~RWSEM_FLAG_HANDOFF;
664 
665 			if (list_empty(&first->list))
666 				new &= ~RWSEM_FLAG_WAITERS;
667 		}
668 	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
669 
670 	/*
671 	 * We have either acquired the lock with handoff bit cleared or set
672 	 * the handoff bit. Only the first waiter can have its handoff_set
673 	 * set here to enable optimistic spinning in slowpath loop.
674 	 */
675 	if (new & RWSEM_FLAG_HANDOFF) {
676 		first->handoff_set = true;
677 		lockevent_inc(rwsem_wlock_handoff);
678 		return false;
679 	}
680 
681 	/*
682 	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
683 	 * success.
684 	 */
685 	__rwsem_del_waiter(sem, waiter);
686 
687 	rwsem_set_owner(sem);
688 	return true;
689 }
690 
691 /*
692  * The rwsem_spin_on_owner() function returns the following 4 values
693  * depending on the lock owner state.
694  *   OWNER_NULL  : owner is currently NULL
695  *   OWNER_WRITER: when owner changes and is a writer
696  *   OWNER_READER: when owner changes and the new owner may be a reader.
697  *   OWNER_NONSPINNABLE:
698  *		   when optimistic spinning has to stop because either the
699  *		   owner stops running, is unknown, or its timeslice has
700  *		   been used up.
701  */
702 enum owner_state {
703 	OWNER_NULL		= 1 << 0,
704 	OWNER_WRITER		= 1 << 1,
705 	OWNER_READER		= 1 << 2,
706 	OWNER_NONSPINNABLE	= 1 << 3,
707 };
708 
709 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
710 /*
711  * Try to acquire write lock before the writer has been put on wait queue.
712  */
713 static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
714 {
715 	long count = atomic_long_read(&sem->count);
716 
717 	while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
718 		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
719 					count | RWSEM_WRITER_LOCKED)) {
720 			rwsem_set_owner(sem);
721 			lockevent_inc(rwsem_opt_lock);
722 			return true;
723 		}
724 	}
725 	return false;
726 }
727 
728 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
729 {
730 	struct task_struct *owner;
731 	unsigned long flags;
732 	bool ret = true;
733 
734 	if (need_resched()) {
735 		lockevent_inc(rwsem_opt_fail);
736 		return false;
737 	}
738 
739 	/*
740 	 * Disabling preemption is equivalent to an RCU read-side critical
741 	 * section, thus the task_struct structure won't go away.
742 	 */
743 	owner = rwsem_owner_flags(sem, &flags);
744 	/*
745 	 * Don't check the read-owner as the entry may be stale.
746 	 */
747 	if ((flags & RWSEM_NONSPINNABLE) ||
748 	    (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
749 		ret = false;
750 
751 	lockevent_cond_inc(rwsem_opt_fail, !ret);
752 	return ret;
753 }
754 
755 static inline enum owner_state
756 rwsem_owner_state(struct task_struct *owner, unsigned long flags)
757 {
758 	if (flags & RWSEM_NONSPINNABLE)
759 		return OWNER_NONSPINNABLE;
760 
761 	if (flags & RWSEM_READER_OWNED)
762 		return OWNER_READER;
763 
764 	return owner ? OWNER_WRITER : OWNER_NULL;
765 }
766 
767 static noinline enum owner_state
768 rwsem_spin_on_owner(struct rw_semaphore *sem)
769 {
770 	struct task_struct *new, *owner;
771 	unsigned long flags, new_flags;
772 	enum owner_state state;
773 
774 	lockdep_assert_preemption_disabled();
775 
776 	owner = rwsem_owner_flags(sem, &flags);
777 	state = rwsem_owner_state(owner, flags);
778 	if (state != OWNER_WRITER)
779 		return state;
780 
781 	for (;;) {
782 		/*
783 		 * When a waiting writer sets the handoff flag, it may spin
784 		 * on the owner as well. Once that writer acquires the lock,
785 		 * we can spin on it. So we don't need to quit even when the
786 		 * handoff bit is set.
787 		 */
788 		new = rwsem_owner_flags(sem, &new_flags);
789 		if ((new != owner) || (new_flags != flags)) {
790 			state = rwsem_owner_state(new, new_flags);
791 			break;
792 		}
793 
794 		/*
795 		 * Ensure we emit the owner->on_cpu dereference _after_
796 		 * checking that sem->owner still matches owner. If that fails,
797 		 * owner might point to free()d memory. If it still matches,
798 		 * our spinning context has already disabled preemption, which
799 		 * is equivalent to an RCU read-side critical section and
800 		 * ensures the memory stays valid.
801 		 */
802 		barrier();
803 
804 		if (need_resched() || !owner_on_cpu(owner)) {
805 			state = OWNER_NONSPINNABLE;
806 			break;
807 		}
808 
809 		cpu_relax();
810 	}
811 
812 	return state;
813 }
814 
815 /*
816  * Calculate reader-owned rwsem spinning threshold for writer
817  *
818  * The more readers own the rwsem, the longer it will take for them to
819  * wind down and free the rwsem. So the empirical formula used to
820  * determine the actual spinning time limit here is:
821  *
822  *   Spinning threshold = (10 + nr_readers/2)us
823  *
824  * The limit is capped to a maximum of 25us (30 readers). This is just
825  * a heuristic and is subject to change in the future.
826  */
827 static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
828 {
829 	long count = atomic_long_read(&sem->count);
830 	int readers = count >> RWSEM_READER_SHIFT;
831 	u64 delta;
832 
833 	if (readers > 30)
834 		readers = 30;
835 	delta = (20 + readers) * NSEC_PER_USEC / 2;
836 
837 	return sched_clock() + delta;
838 }
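
/*
 * Worked example (illustrative only): with 10 readers holding the lock,
 *	delta = (20 + 10) * NSEC_PER_USEC / 2 = 15000 ns = 15 us
 * which matches the (10 + nr_readers/2) us formula above; with 30 or more
 * readers the cap gives (20 + 30) / 2 = 25 us.
 */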
839 
840 static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
841 {
842 	bool taken = false;
843 	int prev_owner_state = OWNER_NULL;
844 	int loop = 0;
845 	u64 rspin_threshold = 0;
846 
847 	/* sem->wait_lock should not be held when doing optimistic spinning */
848 	if (!osq_lock(&sem->osq))
849 		goto done;
850 
851 	/*
852 	 * Optimistically spin on the owner field and attempt to acquire the
853 	 * lock whenever the owner changes. Spinning will be stopped when:
854 	 *  1) the owning writer isn't running; or
855 	 *  2) readers own the lock and spinning time has exceeded limit.
856 	 */
857 	for (;;) {
858 		enum owner_state owner_state;
859 
860 		owner_state = rwsem_spin_on_owner(sem);
861 		if (owner_state == OWNER_NONSPINNABLE)
862 			break;
863 
864 		/*
865 		 * Try to acquire the lock
866 		 */
867 		taken = rwsem_try_write_lock_unqueued(sem);
868 
869 		if (taken)
870 			break;
871 
872 		/*
873 		 * Time-based reader-owned rwsem optimistic spinning
874 		 */
875 		if (owner_state == OWNER_READER) {
876 			/*
877 			 * Re-initialize rspin_threshold every time
878 			 * the owner state changes from non-reader to reader.
879 			 * This allows a writer to steal the lock in between
880 			 * 2 reader phases and have the threshold reset at
881 			 * the beginning of the 2nd reader phase.
882 			 */
883 			if (prev_owner_state != OWNER_READER) {
884 				if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
885 					break;
886 				rspin_threshold = rwsem_rspin_threshold(sem);
887 				loop = 0;
888 			}
889 
890 			/*
891 			 * Check time threshold once every 16 iterations to
892 			 * avoid calling sched_clock() too frequently so
893 			 * as to reduce the average latency between the times
894 			 * when the lock becomes free and when the spinner
895 			 * is ready to do a trylock.
896 			 */
897 			else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
898 				rwsem_set_nonspinnable(sem);
899 				lockevent_inc(rwsem_opt_nospin);
900 				break;
901 			}
902 		}
903 
904 		/*
905 		 * An RT task cannot do optimistic spinning if it cannot
906 		 * be sure the lock holder is running, or live-lock may
907 		 * happen if the current task and the lock holder happen
908 		 * to run on the same CPU. However, aborting optimistic
909 		 * spinning while a NULL owner is detected may miss some
910 		 * opportunities where spinning could continue without causing
911 		 * problems.
912 		 *
913 		 * There are 2 possible cases where an RT task may be able
914 		 * to continue spinning.
915 		 *
916 		 * 1) The lock owner is in the process of releasing the
917 		 *    lock, sem->owner is cleared but the lock has not
918 		 *    been released yet.
919 		 * 2) The lock was free and owner cleared, but another
920 		 *    task just comes in and acquire the lock before
921 		 *    task just comes in and acquires the lock before
922 		 *    writer.
923 		 *
924 		 * To take advantage of two scenarios listed above, the RT
925 		 * task is made to retry one more time to see if it can
926 		 * acquire the lock or continue spinning on the new owning
927 		 * writer. Of course, if the time lag is long enough or the
928 		 * new owner is not a writer or spinnable, the RT task will
929 		 * quit spinning.
930 		 *
931 		 * If the owner is a writer, the need_resched() check is
932 		 * done inside rwsem_spin_on_owner(). If the owner is not
933 		 * a writer, the need_resched() check needs to be done here.
934 		 */
935 		if (owner_state != OWNER_WRITER) {
936 			if (need_resched())
937 				break;
938 			if (rt_or_dl_task(current) &&
939 			   (prev_owner_state != OWNER_WRITER))
940 				break;
941 		}
942 		prev_owner_state = owner_state;
943 
944 		/*
945 		 * The cpu_relax() call is a compiler barrier which forces
946 		 * everything in this loop to be re-loaded. We don't need
947 		 * memory barriers as we'll eventually observe the right
948 		 * values at the cost of a few extra spins.
949 		 */
950 		cpu_relax();
951 	}
952 	osq_unlock(&sem->osq);
953 done:
954 	lockevent_cond_inc(rwsem_opt_fail, !taken);
955 	return taken;
956 }
957 
958 /*
959  * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
960  * only be called when the reader count reaches 0.
961  */
962 static inline void clear_nonspinnable(struct rw_semaphore *sem)
963 {
964 	if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
965 		atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
966 }
967 
968 #else
969 static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
970 {
971 	return false;
972 }
973 
974 static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
975 {
976 	return false;
977 }
978 
979 static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
980 
981 static inline enum owner_state
982 rwsem_spin_on_owner(struct rw_semaphore *sem)
983 {
984 	return OWNER_NONSPINNABLE;
985 }
986 #endif
987 
988 /*
989  * Prepare to wake up waiter(s) in the wait queue by putting them into the
990  * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
991  * reader-owned, wake up the read lock waiters at the queue front, otherwise
992  * wake up any front waiter.
993  *
994  * This is being called from both reader and writer slow paths.
995  */
996 static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
997 					  struct wake_q_head *wake_q)
998 {
999 	enum rwsem_wake_type wake_type;
1000 
1001 	if (count & RWSEM_WRITER_MASK)
1002 		return;
1003 
1004 	if (count & RWSEM_READER_MASK) {
1005 		wake_type = RWSEM_WAKE_READERS;
1006 	} else {
1007 		wake_type = RWSEM_WAKE_ANY;
1008 		clear_nonspinnable(sem);
1009 	}
1010 	rwsem_mark_wake(sem, wake_type, wake_q);
1011 }
1012 
1013 /*
1014  * Wait for the read lock to be granted
1015  */
1016 static struct rw_semaphore __sched *
1017 rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
1018 {
1019 	long adjustment = -RWSEM_READER_BIAS;
1020 	long rcnt = (count >> RWSEM_READER_SHIFT);
1021 	struct rwsem_waiter waiter, *first;
1022 	DEFINE_WAKE_Q(wake_q);
1023 
1024 	/*
1025 	 * To prevent a constant stream of readers from starving a sleeping
1026 	 * writer, don't attempt optimistic lock stealing if the lock is
1027 	 * very likely owned by readers.
1028 	 */
1029 	if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
1030 	    (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
1031 		goto queue;
1032 
1033 	/*
1034 	 * Reader optimistic lock stealing.
1035 	 */
1036 	if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
1037 		rwsem_set_reader_owned(sem);
1038 		lockevent_inc(rwsem_rlock_steal);
1039 
1040 		/*
1041 		 * Wake up other readers in the wait queue if it is
1042 		 * the first reader.
1043 		 */
1044 		if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
1045 			raw_spin_lock_irq(&sem->wait_lock);
1046 			if (sem->first_waiter)
1047 				rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1048 						&wake_q);
1049 			raw_spin_unlock_irq(&sem->wait_lock);
1050 			wake_up_q(&wake_q);
1051 		}
1052 		return sem;
1053 	}
1054 
1055 queue:
1056 	waiter.task = current;
1057 	waiter.type = RWSEM_WAITING_FOR_READ;
1058 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1059 	waiter.handoff_set = false;
1060 
1061 	raw_spin_lock_irq(&sem->wait_lock);
1062 	first = sem->first_waiter;
1063 	if (!first) {
1064 		/*
1065 		 * In case the wait queue is empty and the lock isn't owned
1066 		 * by a writer, this reader can exit the slowpath and return
1067 		 * immediately as its RWSEM_READER_BIAS has already been set
1068 		 * in the count.
1069 		 */
1070 		if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
1071 			/* Provide lock ACQUIRE */
1072 			smp_acquire__after_ctrl_dep();
1073 			raw_spin_unlock_irq(&sem->wait_lock);
1074 			rwsem_set_reader_owned(sem);
1075 			lockevent_inc(rwsem_rlock_fast);
1076 			return sem;
1077 		}
1078 		adjustment += RWSEM_FLAG_WAITERS;
1079 		INIT_LIST_HEAD(&waiter.list);
1080 		sem->first_waiter = &waiter;
1081 	} else {
1082 		list_add_tail(&waiter.list, &first->list);
1083 	}
1084 
1085 	/* we're now waiting on the lock, but no longer actively locking */
1086 	count = atomic_long_add_return(adjustment, &sem->count);
1087 
1088 	rwsem_cond_wake_waiter(sem, count, &wake_q);
1089 	raw_spin_unlock_irq(&sem->wait_lock);
1090 
1091 	if (!wake_q_empty(&wake_q))
1092 		wake_up_q(&wake_q);
1093 
1094 	trace_contention_begin(sem, LCB_F_READ);
1095 	set_current_state(state);
1096 
1097 	if (state == TASK_UNINTERRUPTIBLE)
1098 		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
1099 
1100 	/* wait to be given the lock */
1101 	for (;;) {
1102 		if (!smp_load_acquire(&waiter.task)) {
1103 			/* Matches rwsem_mark_wake()'s smp_store_release(). */
1104 			break;
1105 		}
1106 		if (signal_pending_state(state, current)) {
1107 			raw_spin_lock_irq(&sem->wait_lock);
1108 			if (waiter.task)
1109 				goto out_nolock;
1110 			raw_spin_unlock_irq(&sem->wait_lock);
1111 			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
1112 			break;
1113 		}
1114 		schedule_preempt_disabled();
1115 		lockevent_inc(rwsem_sleep_reader);
1116 		set_current_state(state);
1117 	}
1118 
1119 	if (state == TASK_UNINTERRUPTIBLE)
1120 		hung_task_clear_blocker();
1121 
1122 	__set_current_state(TASK_RUNNING);
1123 	lockevent_inc(rwsem_rlock);
1124 	trace_contention_end(sem, 0);
1125 	return sem;
1126 
1127 out_nolock:
1128 	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1129 	__set_current_state(TASK_RUNNING);
1130 	lockevent_inc(rwsem_rlock_fail);
1131 	trace_contention_end(sem, -EINTR);
1132 	return ERR_PTR(-EINTR);
1133 }
1134 
1135 /*
1136  * Wait until we successfully acquire the write lock
1137  */
1138 static struct rw_semaphore __sched *
1139 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1140 {
1141 	struct rwsem_waiter waiter, *first;
1142 	DEFINE_WAKE_Q(wake_q);
1143 
1144 	/* do optimistic spinning and steal lock if possible */
1145 	if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1146 		/* rwsem_optimistic_spin() implies ACQUIRE on success */
1147 		return sem;
1148 	}
1149 
1150 	/*
1151 	 * Optimistic spinning failed, proceed to the slowpath
1152 	 * and block until we can acquire the sem.
1153 	 */
1154 	waiter.task = current;
1155 	waiter.type = RWSEM_WAITING_FOR_WRITE;
1156 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1157 	waiter.handoff_set = false;
1158 
1159 	raw_spin_lock_irq(&sem->wait_lock);
1160 
1161 	first = sem->first_waiter;
1162 	if (first) {
1163 		list_add_tail(&waiter.list, &first->list);
1164 		rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
1165 				       &wake_q);
1166 		if (!wake_q_empty(&wake_q)) {
1167 			/*
1168 			 * We want to minimize wait_lock hold time especially
1169 			 * when a large number of readers are to be woken up.
1170 			 */
1171 			raw_spin_unlock_irq(&sem->wait_lock);
1172 			wake_up_q(&wake_q);
1173 			raw_spin_lock_irq(&sem->wait_lock);
1174 		}
1175 	} else {
1176 		INIT_LIST_HEAD(&waiter.list);
1177 		sem->first_waiter = &waiter;
1178 		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1179 	}
1180 
1181 	/* wait until we successfully acquire the lock */
1182 	set_current_state(state);
1183 	trace_contention_begin(sem, LCB_F_WRITE);
1184 
1185 	if (state == TASK_UNINTERRUPTIBLE)
1186 		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
1187 
1188 	for (;;) {
1189 		if (rwsem_try_write_lock(sem, &waiter)) {
1190 			/* rwsem_try_write_lock() implies ACQUIRE on success */
1191 			break;
1192 		}
1193 
1194 		raw_spin_unlock_irq(&sem->wait_lock);
1195 
1196 		if (signal_pending_state(state, current))
1197 			goto out_nolock;
1198 
1199 		/*
1200 		 * After setting the handoff bit and failing to acquire
1201 		 * the lock, attempt to spin on owner to accelerate lock
1202 		 * transfer. If the previous owner is an on-cpu writer and it
1203 		 * has just released the lock, OWNER_NULL will be returned.
1204 		 * In this case, we attempt to acquire the lock again
1205 		 * without sleeping.
1206 		 */
1207 		if (waiter.handoff_set) {
1208 			enum owner_state owner_state;
1209 
1210 			owner_state = rwsem_spin_on_owner(sem);
1211 			if (owner_state == OWNER_NULL)
1212 				goto trylock_again;
1213 		}
1214 
1215 		schedule_preempt_disabled();
1216 		lockevent_inc(rwsem_sleep_writer);
1217 		set_current_state(state);
1218 trylock_again:
1219 		raw_spin_lock_irq(&sem->wait_lock);
1220 	}
1221 
1222 	if (state == TASK_UNINTERRUPTIBLE)
1223 		hung_task_clear_blocker();
1224 
1225 	__set_current_state(TASK_RUNNING);
1226 	raw_spin_unlock_irq(&sem->wait_lock);
1227 	lockevent_inc(rwsem_wlock);
1228 	trace_contention_end(sem, 0);
1229 	return sem;
1230 
1231 out_nolock:
1232 	__set_current_state(TASK_RUNNING);
1233 	raw_spin_lock_irq(&sem->wait_lock);
1234 	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1235 	lockevent_inc(rwsem_wlock_fail);
1236 	trace_contention_end(sem, -EINTR);
1237 	return ERR_PTR(-EINTR);
1238 }
1239 
1240 /*
1241  * handle waking up a waiter on the semaphore
1242  * - up_read/up_write has decremented the active part of count if we come here
1243  */
1244 static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
1245 {
1246 	unsigned long flags;
1247 	DEFINE_WAKE_Q(wake_q);
1248 
1249 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1250 
1251 	if (sem->first_waiter)
1252 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1253 
1254 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1255 	wake_up_q(&wake_q);
1256 
1257 	return sem;
1258 }
1259 
1260 /*
1261  * downgrade a write lock into a read lock
1262  * - caller incremented waiting part of count and discovered it still negative
1263  * - just wake up any readers at the front of the queue
1264  */
1265 static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1266 {
1267 	unsigned long flags;
1268 	DEFINE_WAKE_Q(wake_q);
1269 
1270 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1271 
1272 	if (sem->first_waiter)
1273 		rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1274 
1275 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1276 	wake_up_q(&wake_q);
1277 
1278 	return sem;
1279 }
1280 
1281 /*
1282  * lock for reading
1283  */
1284 static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
1285 {
1286 	int ret = 0;
1287 	long count;
1288 
1289 	preempt_disable();
1290 	if (!rwsem_read_trylock(sem, &count)) {
1291 		if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
1292 			ret = -EINTR;
1293 			goto out;
1294 		}
1295 		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1296 	}
1297 out:
1298 	preempt_enable();
1299 	return ret;
1300 }
1301 
1302 static __always_inline void __down_read(struct rw_semaphore *sem)
1303 {
1304 	__down_read_common(sem, TASK_UNINTERRUPTIBLE);
1305 }
1306 
1307 static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
1308 {
1309 	return __down_read_common(sem, TASK_INTERRUPTIBLE);
1310 }
1311 
1312 static __always_inline int __down_read_killable(struct rw_semaphore *sem)
1313 {
1314 	return __down_read_common(sem, TASK_KILLABLE);
1315 }
1316 
1317 static inline int __down_read_trylock(struct rw_semaphore *sem)
1318 {
1319 	int ret = 0;
1320 	long tmp;
1321 
1322 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1323 
1324 	preempt_disable();
1325 	tmp = atomic_long_read(&sem->count);
1326 	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
1327 		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1328 						    tmp + RWSEM_READER_BIAS)) {
1329 			rwsem_set_reader_owned(sem);
1330 			ret = 1;
1331 			break;
1332 		}
1333 	}
1334 	preempt_enable();
1335 	return ret;
1336 }
1337 
1338 /*
1339  * lock for writing
1340  */
1341 static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
1342 {
1343 	int ret = 0;
1344 
1345 	preempt_disable();
1346 	if (unlikely(!rwsem_write_trylock(sem))) {
1347 		if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
1348 			ret = -EINTR;
1349 	}
1350 	preempt_enable();
1351 	return ret;
1352 }
1353 
1354 static __always_inline void __down_write(struct rw_semaphore *sem)
1355 {
1356 	__down_write_common(sem, TASK_UNINTERRUPTIBLE);
1357 }
1358 
1359 static __always_inline int __down_write_killable(struct rw_semaphore *sem)
1360 {
1361 	return __down_write_common(sem, TASK_KILLABLE);
1362 }
1363 
1364 static inline int __down_write_trylock(struct rw_semaphore *sem)
1365 {
1366 	int ret;
1367 
1368 	preempt_disable();
1369 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1370 	ret = rwsem_write_trylock(sem);
1371 	preempt_enable();
1372 
1373 	return ret;
1374 }
1375 
1376 /*
1377  * unlock after reading
1378  */
1379 static inline void __up_read(struct rw_semaphore *sem)
1380 {
1381 	long tmp;
1382 
1383 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1384 	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1385 
1386 	preempt_disable();
1387 	rwsem_clear_reader_owned(sem);
1388 	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1389 	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1390 	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1391 		      RWSEM_FLAG_WAITERS)) {
1392 		clear_nonspinnable(sem);
1393 		rwsem_wake(sem);
1394 	}
1395 	preempt_enable();
1396 }
1397 
1398 /*
1399  * unlock after writing
1400  */
1401 static inline void __up_write(struct rw_semaphore *sem)
1402 {
1403 	long tmp;
1404 
1405 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1406 	/*
1407 	 * sem->owner may differ from current if the ownership is transferred
1408 	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bit.
1409 	 */
1410 	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1411 			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1412 
1413 	preempt_disable();
1414 	rwsem_clear_owner(sem);
1415 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1416 	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1417 		rwsem_wake(sem);
1418 	preempt_enable();
1419 }
1420 
1421 /*
1422  * downgrade write lock to read lock
1423  */
1424 static inline void __downgrade_write(struct rw_semaphore *sem)
1425 {
1426 	long tmp;
1427 
1428 	/*
1429 	 * When downgrading from exclusive to shared ownership,
1430 	 * anything inside the write-locked region cannot leak
1431 	 * into the read side. In contrast, anything in the
1432 	 * read-locked region is ok to be re-ordered into the
1433 	 * write side. As such, rely on RELEASE semantics.
1434 	 */
1435 	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1436 	preempt_disable();
1437 	tmp = atomic_long_fetch_add_release(
1438 		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1439 	rwsem_set_reader_owned(sem);
1440 	if (tmp & RWSEM_FLAG_WAITERS)
1441 		rwsem_downgrade_wake(sem);
1442 	preempt_enable();
1443 }
1444 
1445 #else /* !CONFIG_PREEMPT_RT */
1446 
1447 #define RT_MUTEX_BUILD_MUTEX
1448 #include "rtmutex.c"
1449 
1450 #define rwbase_set_and_save_current_state(state)	\
1451 	set_current_state(state)
1452 
1453 #define rwbase_restore_current_state()			\
1454 	__set_current_state(TASK_RUNNING)
1455 
1456 #define rwbase_rtmutex_lock_state(rtm, state)		\
1457 	__rt_mutex_lock(rtm, state)
1458 
1459 #define rwbase_rtmutex_slowlock_locked(rtm, state, wq)	\
1460 	__rt_mutex_slowlock_locked(rtm, NULL, state, wq)
1461 
1462 #define rwbase_rtmutex_unlock(rtm)			\
1463 	__rt_mutex_unlock(rtm)
1464 
1465 #define rwbase_rtmutex_trylock(rtm)			\
1466 	__rt_mutex_trylock(rtm)
1467 
1468 #define rwbase_signal_pending_state(state, current)	\
1469 	signal_pending_state(state, current)
1470 
1471 #define rwbase_pre_schedule()				\
1472 	rt_mutex_pre_schedule()
1473 
1474 #define rwbase_schedule()				\
1475 	rt_mutex_schedule()
1476 
1477 #define rwbase_post_schedule()				\
1478 	rt_mutex_post_schedule()
1479 
1480 #include "rwbase_rt.c"
1481 
1482 void __init_rwsem(struct rw_semaphore *sem, const char *name,
1483 		  struct lock_class_key *key)
1484 {
1485 	init_rwbase_rt(&(sem)->rwbase);
1486 
1487 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1488 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
1489 	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
1490 #endif
1491 }
1492 EXPORT_SYMBOL(__init_rwsem);
1493 
1494 static inline void __down_read(struct rw_semaphore *sem)
1495 {
1496 	rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1497 }
1498 
1499 static inline int __down_read_interruptible(struct rw_semaphore *sem)
1500 {
1501 	return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
1502 }
1503 
1504 static inline int __down_read_killable(struct rw_semaphore *sem)
1505 {
1506 	return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
1507 }
1508 
1509 static inline int __down_read_trylock(struct rw_semaphore *sem)
1510 {
1511 	return rwbase_read_trylock(&sem->rwbase);
1512 }
1513 
1514 static inline void __up_read(struct rw_semaphore *sem)
1515 {
1516 	rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
1517 }
1518 
1519 static inline void __sched __down_write(struct rw_semaphore *sem)
1520 {
1521 	rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1522 }
1523 
1524 static inline int __sched __down_write_killable(struct rw_semaphore *sem)
1525 {
1526 	return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
1527 }
1528 
1529 static inline int __down_write_trylock(struct rw_semaphore *sem)
1530 {
1531 	return rwbase_write_trylock(&sem->rwbase);
1532 }
1533 
1534 static inline void __up_write(struct rw_semaphore *sem)
1535 {
1536 	rwbase_write_unlock(&sem->rwbase);
1537 }
1538 
1539 static inline void __downgrade_write(struct rw_semaphore *sem)
1540 {
1541 	rwbase_write_downgrade(&sem->rwbase);
1542 }
1543 
1544 /* Debug stubs for the common API */
1545 #define DEBUG_RWSEMS_WARN_ON(c, sem)
1546 
1547 static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
1548 					    struct task_struct *owner)
1549 {
1550 }
1551 
1552 static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1553 {
1554 	int count = atomic_read(&sem->rwbase.readers);
1555 
1556 	return count < 0 && count != READER_BIAS;
1557 }
1558 
1559 #endif /* CONFIG_PREEMPT_RT */
1560 
1561 /*
1562  * lock for reading
1563  */
1564 void __sched down_read(struct rw_semaphore *sem)
1565 	__no_context_analysis
1566 {
1567 	might_sleep();
1568 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1569 
1570 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1571 }
1572 EXPORT_SYMBOL(down_read);
1573 
1574 int __sched down_read_interruptible(struct rw_semaphore *sem)
1575 	__no_context_analysis
1576 {
1577 	might_sleep();
1578 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1579 
1580 	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
1581 		rwsem_release(&sem->dep_map, _RET_IP_);
1582 		return -EINTR;
1583 	}
1584 
1585 	return 0;
1586 }
1587 EXPORT_SYMBOL(down_read_interruptible);
1588 
1589 int __sched down_read_killable(struct rw_semaphore *sem)
1590 	__no_context_analysis
1591 {
1592 	might_sleep();
1593 	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1594 
1595 	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1596 		rwsem_release(&sem->dep_map, _RET_IP_);
1597 		return -EINTR;
1598 	}
1599 
1600 	return 0;
1601 }
1602 EXPORT_SYMBOL(down_read_killable);
1603 
1604 /*
1605  * trylock for reading -- returns 1 if successful, 0 if contention
1606  */
1607 int down_read_trylock(struct rw_semaphore *sem)
1608 	__no_context_analysis
1609 {
1610 	int ret = __down_read_trylock(sem);
1611 
1612 	if (ret == 1)
1613 		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1614 	return ret;
1615 }
1616 EXPORT_SYMBOL(down_read_trylock);
1617 
1618 /*
1619  * lock for writing
1620  */
1621 void __sched down_write(struct rw_semaphore *sem)
1622 	__no_context_analysis
1623 {
1624 	might_sleep();
1625 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1626 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1627 }
1628 EXPORT_SYMBOL(down_write);
1629 
1630 /*
1631  * lock for writing
1632  */
1633 int __sched down_write_killable(struct rw_semaphore *sem)
1634 	__no_context_analysis
1635 {
1636 	might_sleep();
1637 	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1638 
1639 	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1640 				  __down_write_killable)) {
1641 		rwsem_release(&sem->dep_map, _RET_IP_);
1642 		return -EINTR;
1643 	}
1644 
1645 	return 0;
1646 }
1647 EXPORT_SYMBOL(down_write_killable);
1648 
1649 /*
1650  * trylock for writing -- returns 1 if successful, 0 if contention
1651  */
1652 int down_write_trylock(struct rw_semaphore *sem)
1653 	__no_context_analysis
1654 {
1655 	int ret = __down_write_trylock(sem);
1656 
1657 	if (ret == 1)
1658 		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1659 
1660 	return ret;
1661 }
1662 EXPORT_SYMBOL(down_write_trylock);
1663 
1664 /*
1665  * release a read lock
1666  */
1667 void up_read(struct rw_semaphore *sem)
1668 	__no_context_analysis
1669 {
1670 	rwsem_release(&sem->dep_map, _RET_IP_);
1671 	__up_read(sem);
1672 }
1673 EXPORT_SYMBOL(up_read);
1674 
1675 /*
1676  * release a write lock
1677  */
1678 void up_write(struct rw_semaphore *sem)
1679 	__no_context_analysis
1680 {
1681 	rwsem_release(&sem->dep_map, _RET_IP_);
1682 	__up_write(sem);
1683 }
1684 EXPORT_SYMBOL(up_write);
1685 
1686 /*
1687  * downgrade write lock to read lock
1688  */
1689 void downgrade_write(struct rw_semaphore *sem)
1690 	__no_context_analysis
1691 {
1692 	lock_downgrade(&sem->dep_map, _RET_IP_);
1693 	__downgrade_write(sem);
1694 }
1695 EXPORT_SYMBOL(downgrade_write);
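
/*
 * Usage sketch (illustrative only, not part of this file): a typical caller
 * protects shared data with an rwsem using the API exported above.
 * "my_rwsem" and "my_data" are hypothetical names.
 *
 *	static DECLARE_RWSEM(my_rwsem);
 *	static int my_data;
 *
 *	int read_my_data(void)
 *	{
 *		int val;
 *
 *		down_read(&my_rwsem);
 *		val = my_data;
 *		up_read(&my_rwsem);
 *		return val;
 *	}
 *
 *	void write_my_data(int val)
 *	{
 *		down_write(&my_rwsem);
 *		my_data = val;
 *		up_write(&my_rwsem);
 *	}
 */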
1696 
1697 #ifdef CONFIG_DEBUG_LOCK_ALLOC
1698 
1699 void down_read_nested(struct rw_semaphore *sem, int subclass)
1700 	__no_context_analysis
1701 {
1702 	might_sleep();
1703 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1704 	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1705 }
1706 EXPORT_SYMBOL(down_read_nested);
1707 
1708 int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
1709 	__no_context_analysis
1710 {
1711 	might_sleep();
1712 	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1713 
1714 	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1715 		rwsem_release(&sem->dep_map, _RET_IP_);
1716 		return -EINTR;
1717 	}
1718 
1719 	return 0;
1720 }
1721 EXPORT_SYMBOL(down_read_killable_nested);
1722 
1723 void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1724 	__no_context_analysis
1725 {
1726 	might_sleep();
1727 	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1728 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1729 }
1730 EXPORT_SYMBOL(_down_write_nest_lock);
1731 
1732 void down_read_non_owner(struct rw_semaphore *sem)
1733 	__no_context_analysis
1734 {
1735 	might_sleep();
1736 	__down_read(sem);
1737 	/*
1738 	 * The owner value for a reader-owned lock is mostly for debugging
1739 	 * purposes only and is not critical to the correct functioning of
1740 	 * rwsem. So it is perfectly fine to set it in a preempt-enabled
1741 	 * context here.
1742 	 */
1743 	__rwsem_set_reader_owned(sem, NULL);
1744 }
1745 EXPORT_SYMBOL(down_read_non_owner);
1746 
1747 void down_write_nested(struct rw_semaphore *sem, int subclass)
1748 	__no_context_analysis
1749 {
1750 	might_sleep();
1751 	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1752 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1753 }
1754 EXPORT_SYMBOL(down_write_nested);
1755 
1756 int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1757 	__no_context_analysis
1758 {
1759 	might_sleep();
1760 	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1761 
1762 	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1763 				  __down_write_killable)) {
1764 		rwsem_release(&sem->dep_map, _RET_IP_);
1765 		return -EINTR;
1766 	}
1767 
1768 	return 0;
1769 }
1770 EXPORT_SYMBOL(down_write_killable_nested);
1771 
1772 void up_read_non_owner(struct rw_semaphore *sem)
1773 	__no_context_analysis
1774 {
1775 	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1776 	__up_read(sem);
1777 }
1778 EXPORT_SYMBOL(up_read_non_owner);
1779 
1780 #endif
1781