1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook */
3 #include <linux/rculist.h>
4 #include <linux/list.h>
5 #include <linux/hash.h>
6 #include <linux/types.h>
7 #include <linux/spinlock.h>
8 #include <linux/bpf.h>
9 #include <linux/btf_ids.h>
10 #include <linux/bpf_local_storage.h>
11 #include <net/sock.h>
12 #include <uapi/linux/sock_diag.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate.h>
15 #include <linux/rcupdate_trace.h>
16 #include <linux/rcupdate_wait.h>
17
18 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
19
20 static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map * smap,struct bpf_local_storage * local_storage)21 select_bucket(struct bpf_local_storage_map *smap,
22 struct bpf_local_storage *local_storage)
23 {
24 return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
25 }
26
/* Charge @size bytes against @owner through the map's charge callback.
 * Maps without a charge callback succeed unconditionally.
 */
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	return ops->map_local_storage_charge ?
	       ops->map_local_storage_charge(smap, owner, size) : 0;
}
36
/* Return @size bytes to @owner through the map's uncharge callback, if any. */
static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	if (ops->map_local_storage_uncharge)
		ops->map_local_storage_uncharge(smap, owner, size);
}
45
46 static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map * smap,void * owner)47 owner_storage(struct bpf_local_storage_map *smap, void *owner)
48 {
49 struct bpf_map *map = &smap->map;
50
51 return map->ops->map_owner_storage_ptr(owner);
52 }
53
selem_linked_to_storage_lockless(const struct bpf_local_storage_elem * selem)54 static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
55 {
56 return !hlist_unhashed_lockless(&selem->snode);
57 }
58
selem_linked_to_storage(const struct bpf_local_storage_elem * selem)59 static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
60 {
61 return !hlist_unhashed(&selem->snode);
62 }
63
selem_linked_to_map(const struct bpf_local_storage_elem * selem)64 static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
65 {
66 return !hlist_unhashed(&selem->map_node);
67 }
68
69 struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map * smap,void * owner,void * value,bool swap_uptrs)70 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
71 void *value, bool swap_uptrs)
72 {
73 struct bpf_local_storage_elem *selem;
74
75 if (mem_charge(smap, owner, smap->elem_size))
76 return NULL;
77
78 selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
79 __GFP_ZERO, NUMA_NO_NODE);
80
81 if (selem) {
82 RCU_INIT_POINTER(SDATA(selem)->smap, smap);
83 atomic_set(&selem->state, 0);
84
85 if (value) {
86 /* No need to call check_and_init_map_value as memory is zero init */
87 copy_map_value(&smap->map, SDATA(selem)->data, value);
88 if (swap_uptrs)
89 bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
90 }
91 return selem;
92 }
93
94 mem_uncharge(smap, owner, smap->elem_size);
95
96 return NULL;
97 }
98
bpf_local_storage_free_trace_rcu(struct rcu_head * rcu)99 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
100 {
101 struct bpf_local_storage *local_storage;
102
103 /*
104 * RCU Tasks Trace grace period implies RCU grace period, do
105 * kfree() directly.
106 */
107 local_storage = container_of(rcu, struct bpf_local_storage, rcu);
108 kfree(local_storage);
109 }
110
/* Free @local_storage after the appropriate grace period(s).
 * @reuse_now: only a regular RCU grace period is needed; otherwise wait
 * for an RCU Tasks Trace grace period as well (sleepable progs).
 */
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
				   bool reuse_now)
{
	if (!local_storage)
		return;

	if (!reuse_now)
		call_rcu_tasks_trace(&local_storage->rcu,
				     bpf_local_storage_free_trace_rcu);
	else
		kfree_rcu(local_storage, rcu);
}
125
bpf_selem_free_trace_rcu(struct rcu_head * rcu)126 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
127 {
128 struct bpf_local_storage_elem *selem;
129 struct bpf_local_storage_map *smap;
130
131 selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
132 /* The bpf_local_storage_map_free will wait for rcu_barrier */
133 smap = rcu_dereference_check(SDATA(selem)->smap, 1);
134
135 if (smap)
136 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
137 /*
138 * RCU Tasks Trace grace period implies RCU grace period, do
139 * kfree() directly.
140 */
141 kfree(selem);
142 }
143
/* Free @selem after the appropriate grace period(s). With @reuse_now the
 * special fields are released immediately and only a regular RCU grace
 * period is waited for; otherwise the work is deferred to an RCU Tasks
 * Trace callback.
 */
void bpf_selem_free(struct bpf_local_storage_elem *selem,
		    bool reuse_now)
{
	struct bpf_local_storage_map *smap =
		rcu_dereference_check(SDATA(selem)->smap, 1);

	if (!reuse_now) {
		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
		return;
	}

	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	kfree_rcu(selem, rcu);
}
160
/* Free every selem collected on @list (linked via selem->free_node). */
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
	struct bpf_local_storage_elem *selem;
	struct hlist_node *n;

	/* The "_safe" iteration is needed.
	 * The loop is not removing the selem from the list
	 * but bpf_selem_free will use the selem->rcu_head
	 * which is union-ized with the selem->free_node.
	 */
	hlist_for_each_entry_safe(selem, n, list, free_node)
		bpf_selem_free(selem, reuse_now);
}
174
/*
 * Bookkeeping for unlinking @selem from @local_storage: drop the cache
 * entry, uncharge memory, and clear the owner's storage pointer when the
 * last selem goes away (@free_local_storage).
 *
 * @pin_owner: take a temporary owner_refcnt reference so that a
 * concurrent bpf_local_storage_destroy() spinning on the refcount waits
 * for the uncharge/mem_charge update here; bail out if the refcount has
 * already dropped to zero (owner is being destroyed).
 */
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
						 struct bpf_local_storage_map *smap,
						 struct bpf_local_storage *local_storage,
						 bool free_local_storage, bool pin_owner)
{
	void *owner = local_storage->owner;
	u32 uncharge = smap->elem_size;

	/* Invalidate the cached sdata if it points at the selem being removed */
	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
		return;

	/* The storage struct itself is uncharged together with its last selem */
	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
	mem_uncharge(smap, local_storage->owner, uncharge);
	local_storage->mem_charge -= uncharge;

	if (free_local_storage) {
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
	}

	if (pin_owner)
		refcount_dec(&local_storage->owner_refcnt);
}
204
/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 *
 * The selem is moved onto @free_selem_list (via selem->free_node) for the
 * caller to free after dropping the lock. Returns true when this was the
 * last selem, i.e. the caller must also free @local_storage.
 */
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
					    struct bpf_local_storage_elem *selem,
					    struct hlist_head *free_selem_list)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	/* Last selem on the list => the storage itself becomes freeable */
	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);

	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
					     free_local_storage, false);

	hlist_del_init_rcu(&selem->snode);

	hlist_add_head(&selem->free_node, free_selem_list);

	return free_local_storage;
}
230
/* Link @selem onto @local_storage->list and account its size in
 * mem_charge. Caller holds local_storage->lock (or has exclusive access
 * during storage creation).
 */
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	local_storage->mem_charge += smap->elem_size;

	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
242
/* Remove @selem from its map bucket under the bucket lock.
 * Returns 0 on success or the rqspinlock error (e.g. deadlock/timeout)
 * without unlinking.
 */
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	b = select_bucket(smap, local_storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_del_init_rcu(&selem->map_node);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
264
/* Remove @selem from its map bucket. Caller already holds the bucket lock. */
static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
{
	hlist_del_init_rcu(&selem->map_node);
}
269
/* Add @selem to the map bucket selected by @local_storage, taking the
 * bucket lock. Returns 0 on success or the rqspinlock error without
 * linking.
 */
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
		       struct bpf_local_storage *local_storage,
		       struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
289
/* Add @selem to bucket @b. Caller already holds b->lock. */
static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
				      struct bpf_local_storage_elem *selem)
{
	hlist_add_head_rcu(&selem->map_node, &b->list);
}
295
296 /*
297 * Unlink an selem from map and local storage with lock held.
298 * This is the common path used by local storages to delete an selem.
299 */
bpf_selem_unlink(struct bpf_local_storage_elem * selem)300 int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
301 {
302 struct bpf_local_storage *local_storage;
303 bool free_local_storage = false;
304 HLIST_HEAD(selem_free_list);
305 unsigned long flags;
306 int err;
307
308 if (in_nmi())
309 return -EOPNOTSUPP;
310
311 if (unlikely(!selem_linked_to_storage_lockless(selem)))
312 /* selem has already been unlinked from sk */
313 return 0;
314
315 local_storage = rcu_dereference_check(selem->local_storage,
316 bpf_rcu_lock_held());
317
318 err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
319 if (err)
320 return err;
321
322 if (likely(selem_linked_to_storage(selem))) {
323 /* Always unlink from map before unlinking from local_storage
324 * because selem will be freed after successfully unlinked from
325 * the local_storage.
326 */
327 err = bpf_selem_unlink_map(selem);
328 if (err)
329 goto out;
330
331 free_local_storage = bpf_selem_unlink_storage_nolock(
332 local_storage, selem, &selem_free_list);
333 }
334 out:
335 raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
336
337 bpf_selem_free_list(&selem_free_list, false);
338
339 if (free_local_storage)
340 bpf_local_storage_free(local_storage, false);
341
342 return err;
343 }
344
345 /*
346 * Unlink an selem from map and local storage with lockless fallback if callers
347 * are racing or rqspinlock returns error. It should only be called by
348 * bpf_local_storage_destroy() or bpf_local_storage_map_free().
349 */
bpf_selem_unlink_nofail(struct bpf_local_storage_elem * selem,struct bpf_local_storage_map_bucket * b)350 static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
351 struct bpf_local_storage_map_bucket *b)
352 {
353 bool in_map_free = !!b, free_storage = false;
354 struct bpf_local_storage *local_storage;
355 struct bpf_local_storage_map *smap;
356 unsigned long flags;
357 int err, unlink = 0;
358
359 local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
360 smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
361
362 if (smap) {
363 b = b ? : select_bucket(smap, local_storage);
364 err = raw_res_spin_lock_irqsave(&b->lock, flags);
365 if (!err) {
366 /*
367 * Call bpf_obj_free_fields() under b->lock to make sure it is done
368 * exactly once for an selem. Safe to free special fields immediately
369 * as no BPF program should be referencing the selem.
370 */
371 if (likely(selem_linked_to_map(selem))) {
372 hlist_del_init_rcu(&selem->map_node);
373 bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
374 unlink++;
375 }
376 raw_res_spin_unlock_irqrestore(&b->lock, flags);
377 }
378 /*
379 * Highly unlikely scenario: resource leak
380 *
381 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
382 * and both selem belong to the same bucket, if destroy(selem2) acquired
383 * b->lock and block for too long, neither map_free(selem1) and
384 * destroy(selem1) will be able to free the special field associated
385 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
386 */
387 WARN_ON_ONCE(err && in_map_free);
388 if (!err || in_map_free)
389 RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
390 }
391
392 if (local_storage) {
393 err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
394 if (!err) {
395 if (likely(selem_linked_to_storage(selem))) {
396 free_storage = hlist_is_singular_node(&selem->snode,
397 &local_storage->list);
398 /*
399 * Okay to skip clearing owner_storage and storage->owner in
400 * destroy() since the owner is going away. No user or bpf
401 * programs should be able to reference it.
402 */
403 if (smap && in_map_free)
404 bpf_selem_unlink_storage_nolock_misc(
405 selem, smap, local_storage,
406 free_storage, true);
407 hlist_del_init_rcu(&selem->snode);
408 unlink++;
409 }
410 raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
411 }
412 /*
413 * Highly unlikely scenario: memory leak
414 *
415 * When destroy() fails to acqurire local_storage->lock and initializes
416 * selem->local_storage to NULL before any racing map_free() sees the same
417 * selem, no one will free the local storage.
418 */
419 WARN_ON_ONCE(err && !in_map_free);
420 if (!err || !in_map_free)
421 RCU_INIT_POINTER(selem->local_storage, NULL);
422 }
423
424 if (unlink != 2)
425 atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);
426
427 /*
428 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
429 * then freed after an RCU grace period. However, if destroy() and map_free() are
430 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
431 * the selem only after both map_free() and destroy() see the selem.
432 */
433 if (unlink == 2 ||
434 atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
435 bpf_selem_free(selem, true);
436
437 if (free_storage)
438 bpf_local_storage_free(local_storage, true);
439 }
440
/* Publish @selem's sdata into the per-storage lookup cache slot for
 * @smap. Best-effort: silently skipped if the lock cannot be taken or
 * the selem was deleted in the meantime.
 */
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
				      struct bpf_local_storage_map *smap,
				      struct bpf_local_storage_elem *selem)
{
	unsigned long flags;
	int err;

	/* spinlock is needed to avoid racing with the
	 * parallel delete. Otherwise, publishing an already
	 * deleted sdata to the cache will become a use-after-free
	 * problem in the next bpf_local_storage_lookup().
	 */
	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return;

	if (selem_linked_to_storage(selem))
		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
461
/* Validate BPF_NOEXIST/BPF_EXIST update flags against whether an old
 * element exists (BPF_F_LOCK is orthogonal and masked out).
 */
static int check_flags(const struct bpf_local_storage_data *old_sdata,
		       u64 map_flags)
{
	u64 op = map_flags & ~BPF_F_LOCK;

	if (op == BPF_NOEXIST && old_sdata)
		return -EEXIST;	/* elem already exists */

	if (op == BPF_EXIST && !old_sdata)
		return -ENOENT;	/* elem doesn't exist, cannot update it */

	return 0;
}
475
/* Allocate a bpf_local_storage for @owner, seed it with @first_selem,
 * and publish it to the owner's storage pointer with cmpxchg.
 * Returns 0 on success, -ENOMEM, an rqspinlock error, or -EAGAIN when a
 * concurrent allocator won the cmpxchg race (caller may retry).
 */
int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
					 __GFP_ZERO, NUMA_NO_NODE);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_res_spin_lock_init(&storage->lock);
	storage->owner = owner;
	storage->mem_charge = sizeof(*storage);
	refcount_set(&storage->owner_refcnt, 1);

	/* No lock needed: the storage is not published yet */
	bpf_selem_link_storage_nolock(storage, first_selem);

	b = select_bucket(smap, storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		goto uncharge;

	bpf_selem_link_map_nolock(b, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless what
	 * the running context is, bh, irq...etc.
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock. Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		/* Lost the race: undo the map link and roll back */
		bpf_selem_unlink_map_nolock(first_selem);
		raw_res_spin_unlock_irqrestore(&b->lock, flags);
		err = -EAGAIN;
		goto uncharge;
	}
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;

uncharge:
	/* storage may be NULL here; bpf_local_storage_free() handles that */
	bpf_local_storage_free(storage, true);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}
540
/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and other memory issues
 * during map destruction).
 *
 * Create or update the selem for (@owner, @smap) with @value, honoring
 * BPF_NOEXIST/BPF_EXIST/BPF_F_LOCK in @map_flags. Returns the resulting
 * sdata, or an ERR_PTR on invalid flags, allocation failure, lock error
 * or a lost race (-EAGAIN).
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, bool swap_uptrs)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map_bucket *b;
	HLIST_HEAD(old_selem_free_list);
	unsigned long flags, b_flags;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
					      bpf_rcu_lock_held());
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem);
		if (err) {
			bpf_selem_free(selem, true);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	/* A lookup has just been done before and concluded a new selem is
	 * needed. The chance of an unnecessary alloc is unlikely.
	 */
	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
	if (!alloc_selem)
		return ERR_PTR(-ENOMEM);

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		goto free_selem;

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away. It has just been checked before, so very
		 * unlikely. Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		/* In-place locked copy; the freshly allocated selem is unused */
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
	if (err)
		goto unlock;

	/* Ownership of alloc_selem transfers to the lists below */
	alloc_selem = NULL;
	/* First, link the new selem to the map */
	bpf_selem_link_map_nolock(b, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						&old_selem_free_list);
	}

	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
free_selem:
	bpf_selem_free_list(&old_selem_free_list, false);
	if (alloc_selem) {
		/* The new selem was never linked: roll back charge and free */
		mem_uncharge(smap, owner, smap->elem_size);
		bpf_selem_free(alloc_selem, true);
	}
	return err ? ERR_PTR(err) : SDATA(selem);
}
669
bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache * cache)670 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
671 {
672 u64 min_usage = U64_MAX;
673 u16 i, res = 0;
674
675 spin_lock(&cache->idx_lock);
676
677 for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
678 if (cache->idx_usage_counts[i] < min_usage) {
679 min_usage = cache->idx_usage_counts[i];
680 res = i;
681
682 /* Found a free cache_idx */
683 if (!min_usage)
684 break;
685 }
686 }
687 cache->idx_usage_counts[res]++;
688
689 spin_unlock(&cache->idx_lock);
690
691 return res;
692 }
693
/* Release a cache slot previously handed out by
 * bpf_local_storage_cache_idx_get().
 */
static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
					     u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}
701
bpf_local_storage_map_alloc_check(union bpf_attr * attr)702 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
703 {
704 if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
705 !(attr->map_flags & BPF_F_NO_PREALLOC) ||
706 attr->max_entries ||
707 attr->key_size != sizeof(int) || !attr->value_size ||
708 /* Enforce BTF for userspace sk dumping */
709 !attr->btf_key_type_id || !attr->btf_value_type_id)
710 return -EINVAL;
711
712 if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
713 return -E2BIG;
714
715 return 0;
716 }
717
bpf_local_storage_map_check_btf(struct bpf_map * map,const struct btf * btf,const struct btf_type * key_type,const struct btf_type * value_type)718 int bpf_local_storage_map_check_btf(struct bpf_map *map,
719 const struct btf *btf,
720 const struct btf_type *key_type,
721 const struct btf_type *value_type)
722 {
723 if (!btf_type_is_i32(key_type))
724 return -EINVAL;
725
726 return 0;
727 }
728
729 /*
730 * Destroy local storage when the owner is going away. Caller must uncharge memory
731 * if memory charging is used.
732 */
bpf_local_storage_destroy(struct bpf_local_storage * local_storage)733 u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
734 {
735 struct bpf_local_storage_elem *selem;
736
737 /* Neither the bpf_prog nor the bpf_map's syscall
738 * could be modifying the local_storage->list now.
739 * Thus, no elem can be added to or deleted from the
740 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
741 *
742 * It is racing with bpf_local_storage_map_free() alone
743 * when unlinking elem from the local_storage->list and
744 * the map's bucket->list.
745 */
746 hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
747 bpf_selem_unlink_nofail(selem, NULL);
748
749 if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
750 while (refcount_read(&local_storage->owner_refcnt))
751 cpu_relax();
752 /*
753 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
754 * to make sure destroy() sees the correct local_storage->mem_charge.
755 */
756 smp_mb();
757 }
758
759 return local_storage->mem_charge;
760 }
761
bpf_local_storage_map_mem_usage(const struct bpf_map * map)762 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
763 {
764 struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
765 u64 usage = sizeof(*smap);
766
767 /* The dynamically callocated selems are not counted currently. */
768 usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
769 return usage;
770 }
771
/* Allocate and initialize a local storage map: bucket array sized by the
 * number of possible CPUs, per-bucket rqspinlocks, and a lookup cache
 * slot claimed from @cache.
 */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
			    struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	int err;

	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_res_spin_lock_init(&smap->buckets[i].lock);
	}

	/* elem_size covers the header plus attr->value_size bytes of data */
	smap->elem_size = offsetof(struct bpf_local_storage_elem,
				   sdata.data[attr->value_size]);

	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
	return &smap->map;

free_smap:
	/* kvfree(NULL) is a no-op, safe on the !buckets path */
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
814
/* Tear down a local storage map: release its cache slot, unlink every
 * remaining selem from all buckets, then wait out the RCU grace periods
 * before freeing the bucket array and the map itself.
 */
void bpf_local_storage_map_free(struct bpf_map *map,
				struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map_bucket *b;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;
	unsigned int i;

	smap = (struct bpf_local_storage_map *)map;
	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now. No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
restart:
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			bpf_selem_unlink_nofail(selem, b);

			/* Restart after rescheduling: the RCU-protected
			 * iteration cannot be resumed across the break.
			 */
			if (need_resched()) {
				cond_resched_rcu();
				goto restart;
			}
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
	rcu_barrier_tasks_trace();
	rcu_barrier();
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}
878