// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
#else
		struct {
			u16 tail;
			u16 locked_pending;
		};
		struct {
			u8 reserved[2];
			u8 pending;
			u8 locked;
		};
#endif
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

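/*
 * MCS queue nodes, kept in the arena so that every program mapping the arena
 * shares them. Each CPU gets _Q_MAX_NODES slots to cover nested acquisitions;
 * the slot in use is packed into the tail code word together with the CPU
 * number (see encode_tail()/decode_tail() below).
 */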
struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

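/**
 * arena_spin_lock_slowpath - acquire the arena spinlock once the fast path failed
 * @lock: Pointer to queued spinlock structure
 * @val : Current value of the lock word
 *
 * Returns 0 once the lock is held. Unlike the kernel's qspinlock slowpath,
 * this one can fail: it returns -EBUSY when all _Q_MAX_NODES per-CPU MCS
 * nodes are already in use, and -ETIMEDOUT when one of the verifier-bounded
 * wait loops exits before its condition becomes true.
 */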
__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention, queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node, link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements the count it incremented before returning, thus we're
	 * fine. Contexts interrupting us either observe our dec or they don't.
	 * Just ensure the compiler doesn't reorder this statement, as
	 * this_cpu_dec() would have implied that ordering.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * instead of checking it against negative, for passing the
 * BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *		or
 *	if (arena_spin_lock(...) == 0) // success
 *		or
 *	if (arena_spin_lock(...)) // failure
 *		or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}

#define arena_spin_lock_irqsave(lock, flags)                  \
	({                                                    \
		int __ret;                                    \
		bpf_local_irq_save(&(flags));                 \
		__ret = arena_spin_lock((lock));              \
		if (__ret)                                    \
			bpf_local_irq_restore(&(flags));      \
		(__ret);                                      \
	})

#define arena_spin_unlock_irqrestore(lock, flags)             \
	({                                                    \
		arena_spin_unlock((lock));                    \
		bpf_local_irq_restore(&(flags));              \
	})
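
/*
 * Example usage (illustrative sketch, not part of this header): a BPF program
 * that serializes updates to an arena-resident counter. The arena map, the
 * "lock" and "counter" globals, and the program section below are assumptions
 * made only for this example.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 1); // number of arena pages
 *	} arena SEC(".maps");
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int prog(struct __sk_buff *ctx)
 *	{
 *		unsigned long flags;
 *
 *		if (arena_spin_lock_irqsave(&lock, flags))
 *			return 0;	// lock not acquired; do not unlock
 *		counter++;		// critical section
 *		arena_spin_unlock_irqrestore(&lock, flags);
 *		return 0;
 *	}
 */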

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */