xref: /linux/kernel/bpf/bpf_local_storage.c (revision 32e940f2bd3b16551f23ea44be47f6f5d1746d64)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2019 Facebook  */
3 #include <linux/rculist.h>
4 #include <linux/list.h>
5 #include <linux/hash.h>
6 #include <linux/types.h>
7 #include <linux/spinlock.h>
8 #include <linux/bpf.h>
9 #include <linux/btf_ids.h>
10 #include <linux/bpf_local_storage.h>
11 #include <net/sock.h>
12 #include <uapi/linux/sock_diag.h>
13 #include <uapi/linux/btf.h>
14 #include <linux/rcupdate.h>
15 #include <linux/rcupdate_trace.h>
16 #include <linux/rcupdate_wait.h>
17 
18 #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
19 
/* Pick the map bucket for a (map, local_storage) pair.  Bucketing hashes
 * the local_storage pointer, so all selems of one owner in this map land
 * in the same bucket.
 */
static struct bpf_local_storage_map_bucket *
select_bucket(struct bpf_local_storage_map *smap,
	      struct bpf_local_storage *local_storage)
{
	return &smap->buckets[hash_ptr(local_storage, smap->bucket_log)];
}
26 
/* Charge @size bytes against @owner's memory budget through the map's
 * optional map_local_storage_charge callback.
 *
 * Returns 0 when the map implements no charging, otherwise the
 * callback's result (non-zero means the charge was refused).
 */
static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	return ops->map_local_storage_charge ?
	       ops->map_local_storage_charge(smap, owner, size) : 0;
}
36 
/* Return @size bytes to @owner's memory budget.  A no-op for maps that
 * do not implement map_local_storage_uncharge.
 */
static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
			 u32 size)
{
	const struct bpf_map_ops *ops = smap->map.ops;

	if (ops->map_local_storage_uncharge)
		ops->map_local_storage_uncharge(smap, owner, size);
}
45 
/* Return the address of the owner's storage pointer (e.g. the field
 * holding sk->sk_bpf_storage) via the map-type-specific callback.
 */
static struct bpf_local_storage __rcu **
owner_storage(struct bpf_local_storage_map *smap, void *owner)
{
	struct bpf_map *map = &smap->map;

	return map->ops->map_owner_storage_ptr(owner);
}
53 
/* True if @selem is still on a local_storage->list.  Safe to call
 * without holding local_storage->lock (lockless hlist check).
 */
static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed_lockless(&selem->snode);
}
58 
/* True if @selem is still on a local_storage->list.  Caller must hold
 * local_storage->lock (non-lockless hlist check).
 */
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->snode);
}
63 
/* True if @selem is still linked into a map bucket's list. */
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
	return !hlist_unhashed(&selem->map_node);
}
68 
69 struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map * smap,void * owner,void * value,bool swap_uptrs)70 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
71 		void *value, bool swap_uptrs)
72 {
73 	struct bpf_local_storage_elem *selem;
74 
75 	if (mem_charge(smap, owner, smap->elem_size))
76 		return NULL;
77 
78 	selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
79 				       __GFP_ZERO, NUMA_NO_NODE);
80 
81 	if (selem) {
82 		RCU_INIT_POINTER(SDATA(selem)->smap, smap);
83 		atomic_set(&selem->state, 0);
84 
85 		if (value) {
86 			/* No need to call check_and_init_map_value as memory is zero init */
87 			copy_map_value(&smap->map, SDATA(selem)->data, value);
88 			if (swap_uptrs)
89 				bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
90 		}
91 		return selem;
92 	}
93 
94 	mem_uncharge(smap, owner, smap->elem_size);
95 
96 	return NULL;
97 }
98 
/* RCU Tasks Trace callback that frees a struct bpf_local_storage. */
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage *local_storage;

	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
	kfree(local_storage);
}
110 
/* Free @local_storage after the appropriate grace period.
 *
 * @reuse_now selects plain RCU (kfree_rcu) when no sleepable BPF
 * program can still hold a reference; otherwise an RCU Tasks Trace
 * grace period is waited for as well.  NULL is tolerated.
 */
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
				   bool reuse_now)
{
	if (!local_storage)
		return;

	if (!reuse_now)
		call_rcu_tasks_trace(&local_storage->rcu,
				     bpf_local_storage_free_trace_rcu);
	else
		kfree_rcu(local_storage, rcu);
}
125 
/* RCU Tasks Trace callback that frees a selem, releasing any special
 * fields (kptrs etc.) in its data first.
 */
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* The bpf_local_storage_map_free will wait for rcu_barrier */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	/* smap may already be NULL if the map has been freed and cleared
	 * the back-pointer; the fields were released on that path.
	 */
	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	/*
	 * RCU Tasks Trace grace period implies RCU grace period, do
	 * kfree() directly.
	 */
	kfree(selem);
}
143 
/* Free @selem after the appropriate grace period.
 *
 * @reuse_now: plain RCU (kfree_rcu) suffices and special fields are
 * released immediately; otherwise defer everything to the RCU Tasks
 * Trace callback.
 */
void bpf_selem_free(struct bpf_local_storage_elem *selem,
		    bool reuse_now)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, 1);

	if (reuse_now) {
		/* smap is NULL when the map already released the fields */
		if (smap)
			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
		kfree_rcu(selem, rcu);
		return;
	}

	call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
}
160 
/* Free every selem collected on @list (linked via selem->free_node). */
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
	struct bpf_local_storage_elem *selem;
	struct hlist_node *n;

	/* The "_safe" iteration is needed.
	 * The loop is not removing the selem from the list
	 * but bpf_selem_free will use the selem->rcu_head
	 * which is union-ized with the selem->free_node.
	 */
	hlist_for_each_entry_safe(selem, n, list, free_node)
		bpf_selem_free(selem, reuse_now);
}
174 
/* Bookkeeping shared by the selem-unlink paths: invalidate the lookup
 * cache slot, uncharge the element's memory and, when this was the last
 * selem (@free_local_storage), detach the storage from its owner.
 *
 * @pin_owner: take a reference on local_storage->owner_refcnt around the
 * uncharge so the owner cannot go away underneath it; if the refcount
 * has already dropped to zero the owner is being destroyed and all
 * bookkeeping is skipped.
 */
static void bpf_selem_unlink_storage_nolock_misc(struct bpf_local_storage_elem *selem,
						 struct bpf_local_storage_map *smap,
						 struct bpf_local_storage *local_storage,
						 bool free_local_storage, bool pin_owner)
{
	void *owner = local_storage->owner;
	u32 uncharge = smap->elem_size;

	/* Drop the cache entry if it points at this selem */
	if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
	    SDATA(selem))
		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

	if (pin_owner && !refcount_inc_not_zero(&local_storage->owner_refcnt))
		return;

	/* The last selem also returns the storage struct's own charge */
	uncharge += free_local_storage ? sizeof(*local_storage) : 0;
	mem_uncharge(smap, local_storage->owner, uncharge);
	local_storage->mem_charge -= uncharge;

	if (free_local_storage) {
		local_storage->owner = NULL;

		/* After this RCU_INIT, owner may be freed and cannot be used */
		RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
	}

	if (pin_owner)
		refcount_dec(&local_storage->owner_refcnt);
}
204 
/* local_storage->lock must be held and selem->local_storage == local_storage.
 * The caller must ensure selem->smap is still valid to be
 * dereferenced for its smap->elem_size and smap->cache_idx.
 *
 * The selem is moved onto @free_selem_list instead of being freed here;
 * the caller frees it (via bpf_selem_free_list) after dropping the lock.
 *
 * Returns true when @selem was the last element, i.e. the caller must
 * also free @local_storage.
 */
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
					    struct bpf_local_storage_elem *selem,
					    struct hlist_head *free_selem_list)
{
	struct bpf_local_storage_map *smap;
	bool free_local_storage;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	free_local_storage = hlist_is_singular_node(&selem->snode,
						    &local_storage->list);

	bpf_selem_unlink_storage_nolock_misc(selem, smap, local_storage,
					     free_local_storage, false);

	hlist_del_init_rcu(&selem->snode);

	/* Defer the actual free to the caller, outside the lock */
	hlist_add_head(&selem->free_node, free_selem_list);

	return free_local_storage;
}
230 
/* Link @selem into @local_storage's list and account its size in
 * local_storage->mem_charge.  Caller holds local_storage->lock (or the
 * storage is not yet published, as in bpf_local_storage_alloc()).
 */
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
				   struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map *smap;

	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	local_storage->mem_charge += smap->elem_size;

	RCU_INIT_POINTER(selem->local_storage, local_storage);
	hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
242 
/* Remove @selem from its map bucket list under the bucket lock.
 * Returns 0 on success or the negative error from
 * raw_res_spin_lock_irqsave() if the bucket lock cannot be acquired.
 */
static int bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
	b = select_bucket(smap, local_storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_del_init_rcu(&selem->map_node);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
264 
/* Remove @selem from its map bucket list; caller holds the bucket lock. */
static void bpf_selem_unlink_map_nolock(struct bpf_local_storage_elem *selem)
{
	hlist_del_init_rcu(&selem->map_node);
}
269 
/* Link @selem into the map bucket selected by @local_storage.
 * Returns 0 on success or the negative error from
 * raw_res_spin_lock_irqsave() if the bucket lock cannot be acquired.
 */
int bpf_selem_link_map(struct bpf_local_storage_map *smap,
		       struct bpf_local_storage *local_storage,
		       struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		return err;

	hlist_add_head_rcu(&selem->map_node, &b->list);
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
}
289 
/* Link @selem into bucket @b; caller holds b->lock. */
static void bpf_selem_link_map_nolock(struct bpf_local_storage_map_bucket *b,
				      struct bpf_local_storage_elem *selem)
{
	hlist_add_head_rcu(&selem->map_node, &b->list);
}
295 
/*
 * Unlink an selem from map and local storage with lock held.
 * This is the common path used by local storages to delete an selem.
 *
 * Returns 0 on success (including when the selem was already unlinked),
 * -EOPNOTSUPP in NMI context, or a negative error if a lock could not
 * be acquired.
 */
int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
{
	struct bpf_local_storage *local_storage;
	bool free_local_storage = false;
	HLIST_HEAD(selem_free_list);
	unsigned long flags;
	int err;

	/* The locks below cannot be taken safely from NMI context */
	if (in_nmi())
		return -EOPNOTSUPP;

	if (unlikely(!selem_linked_to_storage_lockless(selem)))
		/* selem has already been unlinked from sk */
		return 0;

	local_storage = rcu_dereference_check(selem->local_storage,
					      bpf_rcu_lock_held());

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return err;

	/* Re-check under the lock: a parallel unlink may have won */
	if (likely(selem_linked_to_storage(selem))) {
		/* Always unlink from map before unlinking from local_storage
		 * because selem will be freed after successfully unlinked from
		 * the local_storage.
		 */
		err = bpf_selem_unlink_map(selem);
		if (err)
			goto out;

		free_local_storage = bpf_selem_unlink_storage_nolock(
			local_storage, selem, &selem_free_list);
	}
out:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);

	/* Free outside the lock; reuse_now=false waits for tasks-trace RCU */
	bpf_selem_free_list(&selem_free_list, false);

	if (free_local_storage)
		bpf_local_storage_free(local_storage, false);

	return err;
}
344 
/*
 * Unlink an selem from map and local storage with lockless fallback if callers
 * are racing or rqspinlock returns error. It should only be called by
 * bpf_local_storage_destroy() or bpf_local_storage_map_free().
 *
 * @b is non-NULL only on the map_free() path (the caller already knows
 * the bucket); destroy() passes NULL and the bucket is looked up here.
 */
static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
				    struct bpf_local_storage_map_bucket *b)
{
	bool in_map_free = !!b, free_storage = false;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map *smap;
	unsigned long flags;
	int err, unlink = 0;

	local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held());
	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());

	if (smap) {
		b = b ? : select_bucket(smap, local_storage);
		err = raw_res_spin_lock_irqsave(&b->lock, flags);
		if (!err) {
			/*
			 * Call bpf_obj_free_fields() under b->lock to make sure it is done
			 * exactly once for an selem. Safe to free special fields immediately
			 * as no BPF program should be referencing the selem.
			 */
			if (likely(selem_linked_to_map(selem))) {
				hlist_del_init_rcu(&selem->map_node);
				bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&b->lock, flags);
		}
		/*
		 * Highly unlikely scenario: resource leak
		 *
		 * When map_free(selem1), destroy(selem1) and destroy(selem2) are racing
		 * and both selem belong to the same bucket, if destroy(selem2) acquired
		 * b->lock and blocks for too long, neither map_free(selem1) nor
		 * destroy(selem1) will be able to free the special field associated
		 * with selem1 as raw_res_spin_lock_irqsave() returns -ETIMEDOUT.
		 */
		WARN_ON_ONCE(err && in_map_free);
		if (!err || in_map_free)
			RCU_INIT_POINTER(SDATA(selem)->smap, NULL);
	}

	if (local_storage) {
		err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
		if (!err) {
			if (likely(selem_linked_to_storage(selem))) {
				free_storage = hlist_is_singular_node(&selem->snode,
								      &local_storage->list);
				 /*
				  * Okay to skip clearing owner_storage and storage->owner in
				  * destroy() since the owner is going away. No user or bpf
				  * programs should be able to reference it.
				  */
				if (smap && in_map_free)
					bpf_selem_unlink_storage_nolock_misc(
						selem, smap, local_storage,
						free_storage, true);
				hlist_del_init_rcu(&selem->snode);
				unlink++;
			}
			raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
		}
		/*
		 * Highly unlikely scenario: memory leak
		 *
		 * When destroy() fails to acquire local_storage->lock and initializes
		 * selem->local_storage to NULL before any racing map_free() sees the same
		 * selem, no one will free the local storage.
		 */
		WARN_ON_ONCE(err && !in_map_free);
		if (!err || !in_map_free)
			RCU_INIT_POINTER(selem->local_storage, NULL);
	}

	/* Record which side has seen (and unlinked) this selem */
	if (unlink != 2)
		atomic_or(in_map_free ? SELEM_MAP_UNLINKED : SELEM_STORAGE_UNLINKED, &selem->state);

	/*
	 * Normally, an selem can be unlinked under local_storage->lock and b->lock, and
	 * then freed after an RCU grace period. However, if destroy() and map_free() are
	 * racing or rqspinlock returns errors in unlikely situations (unlink != 2), free
	 * the selem only after both map_free() and destroy() see the selem.
	 */
	if (unlink == 2 ||
	    atomic_cmpxchg(&selem->state, SELEM_UNLINKED, SELEM_TOFREE) == SELEM_UNLINKED)
		bpf_selem_free(selem, true);

	if (free_storage)
		bpf_local_storage_free(local_storage, true);
}
440 
/* Publish @selem's sdata into @local_storage's per-map cache slot so the
 * next lookup can skip the list walk.  Best effort: silently skipped if
 * the storage lock cannot be taken or the selem was unlinked meanwhile.
 */
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
				      struct bpf_local_storage_map *smap,
				      struct bpf_local_storage_elem *selem)
{
	unsigned long flags;
	int err;

	/* spinlock is needed to avoid racing with the
	 * parallel delete.  Otherwise, publishing an already
	 * deleted sdata to the cache will become a use-after-free
	 * problem in the next bpf_local_storage_lookup().
	 */
	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		return;

	if (selem_linked_to_storage(selem))
		rcu_assign_pointer(local_storage->cache[smap->cache_idx], SDATA(selem));
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
}
461 
/* Validate BPF_NOEXIST/BPF_EXIST in @map_flags against whether an
 * element (@old_sdata) already exists.
 *
 * Returns 0 when the flags are satisfied, -EEXIST when BPF_NOEXIST is
 * set but the element exists, -ENOENT when BPF_EXIST is set but it
 * does not.
 */
static int check_flags(const struct bpf_local_storage_data *old_sdata,
		       u64 map_flags)
{
	u64 exist_flags = map_flags & ~BPF_F_LOCK;

	if (exist_flags == BPF_NOEXIST && old_sdata)
		/* elem already exists */
		return -EEXIST;

	if (exist_flags == BPF_EXIST && !old_sdata)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
475 
/* Allocate an owner's very first bpf_local_storage, link @first_selem
 * into it and into the map bucket, and publish the storage to the owner
 * with cmpxchg.
 *
 * Returns 0 on success; -ENOMEM on allocation failure; -EAGAIN if a
 * racing allocator published a storage first; or a lock error.  On any
 * failure the caller still owns (and must free) @first_selem.
 */
int bpf_local_storage_alloc(void *owner,
			    struct bpf_local_storage_map *smap,
			    struct bpf_local_storage_elem *first_selem)
{
	struct bpf_local_storage *prev_storage, *storage;
	struct bpf_local_storage **owner_storage_ptr;
	struct bpf_local_storage_map_bucket *b;
	unsigned long flags;
	int err;

	err = mem_charge(smap, owner, sizeof(*storage));
	if (err)
		return err;

	storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
					 __GFP_ZERO, NUMA_NO_NODE);
	if (!storage) {
		err = -ENOMEM;
		goto uncharge;
	}

	INIT_HLIST_HEAD(&storage->list);
	raw_res_spin_lock_init(&storage->lock);
	storage->owner = owner;
	storage->mem_charge = sizeof(*storage);
	refcount_set(&storage->owner_refcnt, 1);

	/* Storage is not yet published, so no lock is needed here */
	bpf_selem_link_storage_nolock(storage, first_selem);

	b = select_bucket(smap, storage);
	err = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (err)
		goto uncharge;

	bpf_selem_link_map_nolock(b, first_selem);

	owner_storage_ptr =
		(struct bpf_local_storage **)owner_storage(smap, owner);
	/* Publish storage to the owner.
	 * Instead of using any lock of the kernel object (i.e. owner),
	 * cmpxchg will work with any kernel object regardless what
	 * the running context is, bh, irq...etc.
	 *
	 * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
	 * is protected by the storage->lock.  Hence, when freeing
	 * the owner->storage, the storage->lock must be held before
	 * setting owner->storage ptr to NULL.
	 */
	prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
	if (unlikely(prev_storage)) {
		/* A racing allocator won; undo the map link and retry
		 * from the caller via -EAGAIN.
		 */
		bpf_selem_unlink_map_nolock(first_selem);
		raw_res_spin_unlock_irqrestore(&b->lock, flags);
		err = -EAGAIN;
		goto uncharge;
	}
	raw_res_spin_unlock_irqrestore(&b->lock, flags);

	return 0;

uncharge:
	/* storage may be NULL here; bpf_local_storage_free() tolerates it */
	bpf_local_storage_free(storage, true);
	mem_uncharge(smap, owner, sizeof(*storage));
	return err;
}
540 
/* sk cannot be going away because it is linking new elem
 * to sk->sk_bpf_storage. (i.e. sk->sk_refcnt cannot be 0).
 * Otherwise, it will become a leak (and other memory issues
 * during map destruction).
 *
 * Create or update the owner's element for @smap with @value, honoring
 * BPF_EXIST/BPF_NOEXIST/BPF_F_LOCK in @map_flags.  Returns the (new or
 * updated) sdata on success, or an ERR_PTR on failure.
 */
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
			 void *value, u64 map_flags, bool swap_uptrs)
{
	struct bpf_local_storage_data *old_sdata = NULL;
	struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
	struct bpf_local_storage *local_storage;
	struct bpf_local_storage_map_bucket *b;
	HLIST_HEAD(old_selem_free_list);
	unsigned long flags, b_flags;
	int err;

	/* BPF_EXIST and BPF_NOEXIST cannot be both set */
	if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
	    /* BPF_F_LOCK can only be used in a value with spin_lock */
	    unlikely((map_flags & BPF_F_LOCK) &&
		     !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
		return ERR_PTR(-EINVAL);

	local_storage = rcu_dereference_check(*owner_storage(smap, owner),
					      bpf_rcu_lock_held());
	if (!local_storage || hlist_empty(&local_storage->list)) {
		/* Very first elem for the owner */
		err = check_flags(NULL, map_flags);
		if (err)
			return ERR_PTR(err);

		selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
		if (!selem)
			return ERR_PTR(-ENOMEM);

		err = bpf_local_storage_alloc(owner, smap, selem);
		if (err) {
			bpf_selem_free(selem, true);
			mem_uncharge(smap, owner, smap->elem_size);
			return ERR_PTR(err);
		}

		return SDATA(selem);
	}

	if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
		/* Hoping to find an old_sdata to do inline update
		 * such that it can avoid taking the local_storage->lock
		 * and changing the lists.
		 */
		old_sdata =
			bpf_local_storage_lookup(local_storage, smap, false);
		err = check_flags(old_sdata, map_flags);
		if (err)
			return ERR_PTR(err);
		if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
			copy_map_value_locked(&smap->map, old_sdata->data,
					      value, false);
			return old_sdata;
		}
	}

	/* A lookup has just been done before and concluded a new selem is
	 * needed. The chance of an unnecessary alloc is unlikely.
	 */
	alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
	if (!alloc_selem)
		return ERR_PTR(-ENOMEM);

	err = raw_res_spin_lock_irqsave(&local_storage->lock, flags);
	if (err)
		goto free_selem;

	/* Recheck local_storage->list under local_storage->lock */
	if (unlikely(hlist_empty(&local_storage->list))) {
		/* A parallel del is happening and local_storage is going
		 * away.  It has just been checked before, so very
		 * unlikely.  Return instead of retry to keep things
		 * simple.
		 */
		err = -EAGAIN;
		goto unlock;
	}

	old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
	err = check_flags(old_sdata, map_flags);
	if (err)
		goto unlock;

	if (old_sdata && (map_flags & BPF_F_LOCK)) {
		/* Inline update under the storage lock; the freshly
		 * allocated selem is released on the way out.
		 */
		copy_map_value_locked(&smap->map, old_sdata->data, value,
				      false);
		selem = SELEM(old_sdata);
		goto unlock;
	}

	b = select_bucket(smap, local_storage);

	err = raw_res_spin_lock_irqsave(&b->lock, b_flags);
	if (err)
		goto unlock;

	/* Ownership of the new selem transfers to the lists below */
	alloc_selem = NULL;
	/* First, link the new selem to the map */
	bpf_selem_link_map_nolock(b, selem);

	/* Second, link (and publish) the new selem to local_storage */
	bpf_selem_link_storage_nolock(local_storage, selem);

	/* Third, remove old selem, SELEM(old_sdata) */
	if (old_sdata) {
		bpf_selem_unlink_map_nolock(SELEM(old_sdata));
		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
						&old_selem_free_list);
	}

	raw_res_spin_unlock_irqrestore(&b->lock, b_flags);
unlock:
	raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
free_selem:
	bpf_selem_free_list(&old_selem_free_list, false);
	if (alloc_selem) {
		/* The new selem was never linked: undo charge and free */
		mem_uncharge(smap, owner, smap->elem_size);
		bpf_selem_free(alloc_selem, true);
	}
	return err ? ERR_PTR(err) : SDATA(selem);
}
669 
bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache * cache)670 static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
671 {
672 	u64 min_usage = U64_MAX;
673 	u16 i, res = 0;
674 
675 	spin_lock(&cache->idx_lock);
676 
677 	for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
678 		if (cache->idx_usage_counts[i] < min_usage) {
679 			min_usage = cache->idx_usage_counts[i];
680 			res = i;
681 
682 			/* Found a free cache_idx */
683 			if (!min_usage)
684 				break;
685 		}
686 	}
687 	cache->idx_usage_counts[res]++;
688 
689 	spin_unlock(&cache->idx_lock);
690 
691 	return res;
692 }
693 
/* Release a cache slot index previously reserved by
 * bpf_local_storage_cache_idx_get().
 */
static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
					     u16 idx)
{
	spin_lock(&cache->idx_lock);
	cache->idx_usage_counts[idx]--;
	spin_unlock(&cache->idx_lock);
}
701 
bpf_local_storage_map_alloc_check(union bpf_attr * attr)702 int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
703 {
704 	if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
705 	    !(attr->map_flags & BPF_F_NO_PREALLOC) ||
706 	    attr->max_entries ||
707 	    attr->key_size != sizeof(int) || !attr->value_size ||
708 	    /* Enforce BTF for userspace sk dumping */
709 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
710 		return -EINVAL;
711 
712 	if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
713 		return -E2BIG;
714 
715 	return 0;
716 }
717 
/* BTF check for local-storage maps: the key must be a 32-bit integer.
 * The value type is not constrained here.
 */
int bpf_local_storage_map_check_btf(struct bpf_map *map,
				    const struct btf *btf,
				    const struct btf_type *key_type,
				    const struct btf_type *value_type)
{
	if (!btf_type_is_i32(key_type))
		return -EINVAL;

	return 0;
}
728 
/*
 * Destroy local storage when the owner is going away. Caller must uncharge memory
 * if memory charging is used.
 *
 * Returns the number of bytes still accounted in local_storage->mem_charge
 * so the caller can uncharge them.
 */
u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
	struct bpf_local_storage_elem *selem;

	/* Neither the bpf_prog nor the bpf_map's syscall
	 * could be modifying the local_storage->list now.
	 * Thus, no elem can be added to or deleted from the
	 * local_storage->list by the bpf_prog or by the bpf_map's syscall.
	 *
	 * It is racing with bpf_local_storage_map_free() alone
	 * when unlinking elem from the local_storage->list and
	 * the map's bucket->list.
	 */
	hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
		bpf_selem_unlink_nofail(selem, NULL);

	/* Drop the initial owner_refcnt; if a racing unlink still pins
	 * it, spin until that uncharge has completed.
	 */
	if (!refcount_dec_and_test(&local_storage->owner_refcnt)) {
		while (refcount_read(&local_storage->owner_refcnt))
			cpu_relax();
		/*
		 * Paired with refcount_dec() in bpf_selem_unlink_nofail()
		 * to make sure destroy() sees the correct local_storage->mem_charge.
		 */
		smp_mb();
	}

	return local_storage->mem_charge;
}
761 
bpf_local_storage_map_mem_usage(const struct bpf_map * map)762 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
763 {
764 	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
765 	u64 usage = sizeof(*smap);
766 
767 	/* The dynamically callocated selems are not counted currently. */
768 	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
769 	return usage;
770 }
771 
/* Allocate and initialize a local-storage map: the map struct, a
 * power-of-two bucket array sized from the possible-CPU count, and a
 * reserved cache slot index.  Returns the embedded bpf_map or an
 * ERR_PTR on failure.
 */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
			    struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map *smap;
	unsigned int i;
	u32 nbuckets;
	int err;

	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
	if (!smap)
		return ERR_PTR(-ENOMEM);
	bpf_map_init_from_attr(&smap->map, attr);

	nbuckets = roundup_pow_of_two(num_possible_cpus());
	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
	nbuckets = max_t(u32, 2, nbuckets);
	smap->bucket_log = ilog2(nbuckets);

	smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
					 sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
	if (!smap->buckets) {
		err = -ENOMEM;
		goto free_smap;
	}

	for (i = 0; i < nbuckets; i++) {
		INIT_HLIST_HEAD(&smap->buckets[i].list);
		raw_res_spin_lock_init(&smap->buckets[i].lock);
	}

	/* Per-element size: header plus the flexible data[] array */
	smap->elem_size = offsetof(struct bpf_local_storage_elem,
				   sdata.data[attr->value_size]);

	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
	return &smap->map;

free_smap:
	/* buckets is NULL on this path; kvfree(NULL) is a no-op */
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}
814 
/* Tear down a local-storage map: release its cache slot, unlink every
 * remaining selem from all buckets, then wait for the RCU grace periods
 * pending frees depend on before releasing the map itself.
 */
void bpf_local_storage_map_free(struct bpf_map *map,
				struct bpf_local_storage_cache *cache)
{
	struct bpf_local_storage_map_bucket *b;
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;
	unsigned int i;

	smap = (struct bpf_local_storage_map *)map;
	bpf_local_storage_cache_idx_free(cache, smap->cache_idx);

	/* Note that this map might be concurrently cloned from
	 * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
	 * RCU read section to finish before proceeding. New RCU
	 * read sections should be prevented via bpf_map_inc_not_zero.
	 */
	synchronize_rcu();

	/* bpf prog and the userspace can no longer access this map
	 * now.  No new selem (of this map) can be added
	 * to the owner->storage or to the map bucket's list.
	 *
	 * The elem of this map can be cleaned up here
	 * or when the storage is freed e.g.
	 * by bpf_sk_storage_free() during __sk_destruct().
	 */
	for (i = 0; i < (1U << smap->bucket_log); i++) {
		b = &smap->buckets[i];

		rcu_read_lock();
		/* No one is adding to b->list now */
restart:
		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
			bpf_selem_unlink_nofail(selem, b);

			/* Restart the walk after dropping RCU for resched;
			 * the list only shrinks, so this terminates.
			 */
			if (need_resched()) {
				cond_resched_rcu();
				goto restart;
			}
		}
		rcu_read_unlock();
	}

	/* While freeing the storage we may still need to access the map.
	 *
	 * e.g. when bpf_sk_storage_free() has unlinked selem from the map
	 * which then made the above while((selem = ...)) loop
	 * exit immediately.
	 *
	 * However, while freeing the storage one still needs to access the
	 * smap->elem_size to do the uncharging in
	 * bpf_selem_unlink_storage_nolock().
	 *
	 * Hence, wait another rcu grace period for the storage to be freed.
	 */
	synchronize_rcu();

	/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
	rcu_barrier_tasks_trace();
	rcu_barrier();
	kvfree(smap->buckets);
	bpf_map_area_free(smap);
}
878