// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
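
/*
 * For reference, with the offsets above these masks and values expand to the
 * following concrete constants (a worked illustration only, derived from the
 * definitions above):
 *
 *	_Q_LOCKED_MASK   = 0x000000ff	(bits  0-7)
 *	_Q_PENDING_MASK  = 0x0000ff00	(bits  8-15)
 *	_Q_TAIL_IDX_MASK = 0x00030000	(bits 16-17)
 *	_Q_TAIL_CPU_MASK = 0xfffc0000	(bits 18-31)
 *	_Q_LOCKED_VAL    = 0x00000001
 *	_Q_PENDING_VAL   = 0x00000100
 */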

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}
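
/*
 * Worked example (illustrative only): for cpu == 2 and idx == 1,
 * encode_tail() produces
 *
 *	tail = (2 + 1) << 18 | 1 << 16 = 0x000d0000
 *
 * and decode_tail(0x000d0000) recovers cpu = (0xd0000 >> 18) - 1 = 2 and
 * idx = (0xd0000 & 0x30000) >> 16 = 1, i.e. &qnodes[2][1].mcs.
 */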

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK	(_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/* These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements count it incremented before returning, thus we're fine.
	 * For contexts interrupting us, they either observe our dec or not.
	 * Just ensure the compiler doesn't reorder this statement, as a
	 * this_cpu_dec would have implied that ordering.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * instead of checking it against negative, for passing the
 * BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *		or
 *	if (arena_spin_lock(...) == 0) // success
 *		or
 *	if (arena_spin_lock(...)) // failure
 *		or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}
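
/*
 * A minimal usage sketch (illustrative only; the arena-resident lock and the
 * "shared" object it protects are hypothetical, not part of this header).
 * Per the comment above, the return value is tested against zero:
 *
 *	arena_spinlock_t __arena *lock;		// allocated from a BPF arena
 *
 *	if (arena_spin_lock(lock))
 *		return 0;			// not acquired, skip the update
 *	shared->counter++;			// critical section
 *	arena_spin_unlock(lock);
 */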

#define arena_spin_lock_irqsave(lock, flags)		\
	({						\
		int __ret;				\
		bpf_local_irq_save(&(flags));		\
		__ret = arena_spin_lock((lock));	\
		if (__ret)				\
			bpf_local_irq_restore(&(flags));\
		(__ret);				\
	})

#define arena_spin_unlock_irqrestore(lock, flags)	\
	({						\
		arena_spin_unlock((lock));		\
		bpf_local_irq_restore(&(flags));	\
	})
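
/*
 * The irqsave/irqrestore variants follow the same pattern when the critical
 * section must also run with IRQs disabled (again only a sketch; "flags"
 * must be an unsigned long visible at both call sites):
 *
 *	unsigned long flags;
 *
 *	if (arena_spin_lock_irqsave(lock, flags))
 *		return 0;
 *	...critical section...
 *	arena_spin_unlock_irqrestore(lock, flags);
 */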

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */