// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors.  Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
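
/*
 * Note on notation: the state-transition annotations in the comments below,
 * e.g. "0,1,0 -> 0,0,1", follow the kernel qspinlock convention and read as
 * (queue tail, pending bit, lock value).
 */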

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}
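
/*
 * Worked example (illustrative, not part of the original code): for cpu == 2
 * and idx == 1, encode_tail() yields
 *	tail = ((2 + 1) << _Q_TAIL_CPU_OFFSET) | (1 << _Q_TAIL_IDX_OFFSET)
 *	     = (3 << 18) | (1 << 16) = 0xd0000,
 * and decode_tail(0xd0000) maps that back to &qnodes[2][1].mcs.
 */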

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention, queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true on some
	 * architectures even though the chance of needing more than 4 nodes
	 * is still extremely unlikely. When that happens, we simply return
	 * an error. The original qspinlock has a trylock fallback for this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * If there was a previous node, link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to the lack of an equivalent instruction in the BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set,
	 * which will then detect the remaining tail and queue behind us,
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements the count it incremented before returning, so we're fine.
	 * Contexts interrupting us either observe our decrement or they don't.
	 * Just ensure the compiler doesn't reorder this statement, as a
	 * this_cpu_dec would have implied that implicitly.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than being checked for a negative value, in order to
 * pass the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *		or
 *	if (arena_spin_lock(...) == 0) // success
 *		or
 *	if (arena_spin_lock(...)) // failure
 *		or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)             \
	({                                               \
		int __ret;                               \
		bpf_local_irq_save(&(flags));            \
		__ret = arena_spin_lock((lock));         \
		if (__ret)                               \
			bpf_local_irq_restore(&(flags)); \
		(__ret);                                 \
	})

#define arena_spin_unlock_irqrestore(lock, flags) \
	({                                        \
		arena_spin_unlock((lock));        \
		bpf_local_irq_restore(&(flags));  \
	})
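
/*
 * Usage sketch (illustrative only; the identifiers below are hypothetical and
 * the arena map setup is assumed to be done elsewhere by the caller):
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	// plain form
 *	if (arena_spin_lock(&lock))
 *		return 0;	// acquisition failed, nothing to undo
 *	counter++;		// critical section
 *	arena_spin_unlock(&lock);
 *
 *	// IRQ-safe form
 *	unsigned long flags;
 *
 *	if (arena_spin_lock_irqsave(&lock, flags))
 *		return 0;	// on failure, IRQ state has already been restored
 *	counter++;
 *	arena_spin_unlock_irqrestore(&lock, flags);
 */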

#endif

#endif /* BPF_ARENA_SPIN_LOCK_H */