1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
3 * Copyright (c) 2016 Facebook
4 */
5 #include <linux/bpf.h>
6 #include <linux/btf.h>
7 #include <linux/jhash.h>
8 #include <linux/filter.h>
9 #include <linux/rculist_nulls.h>
10 #include <linux/rcupdate_wait.h>
11 #include <linux/random.h>
12 #include <uapi/linux/btf.h>
13 #include <linux/rcupdate_trace.h>
14 #include <linux/btf_ids.h>
15 #include "percpu_freelist.h"
16 #include "bpf_lru_list.h"
17 #include "map_in_map.h"
18 #include <linux/bpf_mem_alloc.h>
19 #include <asm/rqspinlock.h>
20
21 #define HTAB_CREATE_FLAG_MASK \
22 (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
23 BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
24
25 #define BATCH_OPS(_name) \
26 .map_lookup_batch = \
27 _name##_map_lookup_batch, \
28 .map_lookup_and_delete_batch = \
29 _name##_map_lookup_and_delete_batch, \
30 .map_update_batch = \
31 generic_map_update_batch, \
32 .map_delete_batch = \
33 generic_map_delete_batch
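
/* Illustrative expansion (a sketch, not compiled here): for the regular hash
 * map, BATCH_OPS(htab) expands to
 *
 *	.map_lookup_batch = htab_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 *
 * i.e. the lookup-style batch ops are htab specific while update/delete reuse
 * the generic implementations.
 */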
34
35 /*
36 * The bucket lock has two protection scopes:
37 *
38 * 1) Serializing concurrent operations from BPF programs on different
39 * CPUs
40 *
41 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
42 *
43 * BPF programs can execute in any context including perf, kprobes and
44 * tracing. As there are almost no limits on where perf, kprobes and tracing
45 * can be invoked from, the lock operations need to be protected against
46 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
47 * the lock held section when functions which acquire this lock are invoked
48 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
49 * variable bpf_prog_active, which prevents BPF programs attached to perf
50 * events, kprobes and tracing from being invoked before the prior invocation
51 * from one of these contexts has completed. sys_bpf() uses the same mechanism
52 * by pinning the task to the current CPU and incrementing the recursion
53 * protection across the map operation.
54 *
55 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
56 * operations like memory allocations (even with GFP_ATOMIC) from atomic
57 * contexts. This is required because even with GFP_ATOMIC the memory
58 * allocator calls into code paths which acquire locks with long held lock
59 * sections. To ensure the deterministic behaviour these locks are regular
60 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
61 * true atomic contexts on an RT kernel are the low level hardware
62 * handling, scheduling, low level interrupt handling, NMIs etc. None of
63 * these contexts should ever do memory allocations.
64 *
65 * As regular device interrupt handlers and soft interrupts are forced into
66 * thread context, the existing code which does
67 * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
68 * just works.
69 *
70 * In theory the BPF locks could be converted to regular spinlocks as well,
71 * but the bucket locks and percpu_freelist locks can be taken from
72 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
73 * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
74 * it was only safe to use a raw spinlock for a preallocated hash map on an RT
75 * kernel, because there was no memory allocation within the lock held sections.
76 * However, now that the hash map has been fully converted to use bpf_mem_alloc,
77 * memory allocation for a non-preallocated hash map is non-synchronous, so it
78 * is safe to always use a raw spinlock for the bucket lock.
79 */
80 struct bucket {
81 struct hlist_nulls_head head;
82 rqspinlock_t raw_lock;
83 };
84
85 #define HASHTAB_MAP_LOCK_COUNT 8
86 #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
87
88 struct bpf_htab {
89 struct bpf_map map;
90 struct bpf_mem_alloc ma;
91 struct bpf_mem_alloc pcpu_ma;
92 struct bucket *buckets;
93 void *elems;
94 union {
95 struct pcpu_freelist freelist;
96 struct bpf_lru lru;
97 };
98 struct htab_elem *__percpu *extra_elems;
99 /* The number of elements in a non-preallocated hashtable is kept
100 * in either pcount or count.
101 */
102 struct percpu_counter pcount;
103 atomic_t count;
104 bool use_percpu_counter;
105 u32 n_buckets; /* number of hash buckets */
106 u32 elem_size; /* size of each element in bytes */
107 u32 hashrnd;
108 };
109
110 /* each htab element is struct htab_elem + key + value */
111 struct htab_elem {
112 union {
113 struct hlist_nulls_node hash_node;
114 struct {
115 void *padding;
116 union {
117 struct pcpu_freelist_node fnode;
118 struct htab_elem *batch_flink;
119 };
120 };
121 };
122 union {
123 /* pointer to per-cpu pointer */
124 void *ptr_to_pptr;
125 struct bpf_lru_node lru_node;
126 };
127 u32 hash;
128 char key[] __aligned(8);
129 };
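
/* Illustrative element layout (a sketch; see htab_elem_value() and the
 * elem_size computation in htab_map_alloc()):
 *
 *	+------------------+----------------------------+--------------------+
 *	| struct htab_elem | key, round_up(key_size, 8) | value area         |
 *	+------------------+----------------------------+--------------------+
 *
 * For per-cpu maps the value area holds a single pointer to the per-cpu
 * allocation; otherwise it holds the value itself, rounded up to 8 bytes.
 */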
130
131 static inline bool htab_is_prealloc(const struct bpf_htab *htab)
132 {
133 return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
134 }
135
136 static void htab_init_buckets(struct bpf_htab *htab)
137 {
138 unsigned int i;
139
140 for (i = 0; i < htab->n_buckets; i++) {
141 INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
142 raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
143 cond_resched();
144 }
145 }
146
147 static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
148 {
149 unsigned long flags;
150 int ret;
151
152 ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
153 if (ret)
154 return ret;
155 *pflags = flags;
156 return 0;
157 }
158
159 static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
160 {
161 raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
162 }
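
/* Illustrative caller pattern (a sketch, mirroring htab_map_delete_elem() and
 * friends below). The bucket lock is a resilient queued spinlock, so
 * acquisition may fail (e.g. to break a deadlock) and every caller must be
 * prepared to bail out:
 *
 *	unsigned long flags;
 *	int ret;
 *
 *	ret = htab_lock_bucket(b, &flags);
 *	if (ret)
 *		return ret;
 *	... modify the bucket's hlist ...
 *	htab_unlock_bucket(b, flags);
 */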
163
164 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
165
166 static bool htab_is_lru(const struct bpf_htab *htab)
167 {
168 return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
169 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
170 }
171
172 static bool htab_is_percpu(const struct bpf_htab *htab)
173 {
174 return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
175 htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
176 }
177
178 static inline bool is_fd_htab(const struct bpf_htab *htab)
179 {
180 return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS;
181 }
182
183 static inline void *htab_elem_value(struct htab_elem *l, u32 key_size)
184 {
185 return l->key + round_up(key_size, 8);
186 }
187
188 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
189 void __percpu *pptr)
190 {
191 *(void __percpu **)htab_elem_value(l, key_size) = pptr;
192 }
193
194 static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
195 {
196 return *(void __percpu **)htab_elem_value(l, key_size);
197 }
198
199 static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
200 {
201 return *(void **)htab_elem_value(l, map->key_size);
202 }
203
204 static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
205 {
206 return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
207 }
208
209 /* Both percpu and fd htab support in-place update, so no need for
210 * extra elem. LRU itself can remove the least used element, so
211 * there is no need for an extra elem during map_update.
212 */
213 static bool htab_has_extra_elems(struct bpf_htab *htab)
214 {
215 return !htab_is_percpu(htab) && !htab_is_lru(htab) && !is_fd_htab(htab);
216 }
217
218 static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem *elem)
219 {
220 if (btf_record_has_field(htab->map.record, BPF_TIMER))
221 bpf_obj_free_timer(htab->map.record,
222 htab_elem_value(elem, htab->map.key_size));
223 if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
224 bpf_obj_free_workqueue(htab->map.record,
225 htab_elem_value(elem, htab->map.key_size));
226 if (btf_record_has_field(htab->map.record, BPF_TASK_WORK))
227 bpf_obj_free_task_work(htab->map.record,
228 htab_elem_value(elem, htab->map.key_size));
229 }
230
231 static void htab_free_prealloced_internal_structs(struct bpf_htab *htab)
232 {
233 u32 num_entries = htab->map.max_entries;
234 int i;
235
236 if (htab_has_extra_elems(htab))
237 num_entries += num_possible_cpus();
238
239 for (i = 0; i < num_entries; i++) {
240 struct htab_elem *elem;
241
242 elem = get_htab_elem(htab, i);
243 htab_free_internal_structs(htab, elem);
244 cond_resched();
245 }
246 }
247
248 static void htab_free_prealloced_fields(struct bpf_htab *htab)
249 {
250 u32 num_entries = htab->map.max_entries;
251 int i;
252
253 if (IS_ERR_OR_NULL(htab->map.record))
254 return;
255 if (htab_has_extra_elems(htab))
256 num_entries += num_possible_cpus();
257 for (i = 0; i < num_entries; i++) {
258 struct htab_elem *elem;
259
260 elem = get_htab_elem(htab, i);
261 if (htab_is_percpu(htab)) {
262 void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
263 int cpu;
264
265 for_each_possible_cpu(cpu) {
266 bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
267 cond_resched();
268 }
269 } else {
270 bpf_obj_free_fields(htab->map.record,
271 htab_elem_value(elem, htab->map.key_size));
272 cond_resched();
273 }
274 cond_resched();
275 }
276 }
277
278 static void htab_free_elems(struct bpf_htab *htab)
279 {
280 int i;
281
282 if (!htab_is_percpu(htab))
283 goto free_elems;
284
285 for (i = 0; i < htab->map.max_entries; i++) {
286 void __percpu *pptr;
287
288 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
289 htab->map.key_size);
290 free_percpu(pptr);
291 cond_resched();
292 }
293 free_elems:
294 bpf_map_area_free(htab->elems);
295 }
296
297 /* The LRU list has a lock (lru_lock). Each htab bucket has a lock
298 * (bucket_lock). If both locks need to be acquired together, the lock
299 * order is always lru_lock -> bucket_lock and this only happens in
300 * bpf_lru_list.c logic. For example, certain code path of
301 * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
302 * will acquire lru_lock first followed by acquiring bucket_lock.
303 *
304 * In hashtab.c, to avoid deadlock, lock acquisition of
305 * bucket_lock followed by lru_lock is not allowed. In such cases,
306 * bucket_lock needs to be released first before acquiring lru_lock.
307 */
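/* Illustrative ordering for the rule above (a sketch):
 *
 *	allowed:   lru_lock -> bucket_lock   (bpf_lru_pop_free() eviction path)
 *	forbidden: bucket_lock -> lru_lock   (would deadlock against the above)
 *
 * This is why htab_lru_map_update_elem() calls prealloc_lru_pop() before
 * taking the bucket lock, and why htab_lru_push_free() is only called after
 * the bucket lock has been released.
 */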
308 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
309 u32 hash)
310 {
311 struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
312 struct htab_elem *l;
313
314 if (node) {
315 bpf_map_inc_elem_count(&htab->map);
316 l = container_of(node, struct htab_elem, lru_node);
317 memcpy(l->key, key, htab->map.key_size);
318 return l;
319 }
320
321 return NULL;
322 }
323
324 static int prealloc_init(struct bpf_htab *htab)
325 {
326 u32 num_entries = htab->map.max_entries;
327 int err = -ENOMEM, i;
328
329 if (htab_has_extra_elems(htab))
330 num_entries += num_possible_cpus();
331
332 htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
333 htab->map.numa_node);
334 if (!htab->elems)
335 return -ENOMEM;
336
337 if (!htab_is_percpu(htab))
338 goto skip_percpu_elems;
339
340 for (i = 0; i < num_entries; i++) {
341 u32 size = round_up(htab->map.value_size, 8);
342 void __percpu *pptr;
343
344 pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
345 GFP_USER | __GFP_NOWARN);
346 if (!pptr)
347 goto free_elems;
348 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
349 pptr);
350 cond_resched();
351 }
352
353 skip_percpu_elems:
354 if (htab_is_lru(htab))
355 err = bpf_lru_init(&htab->lru,
356 htab->map.map_flags & BPF_F_NO_COMMON_LRU,
357 offsetof(struct htab_elem, hash) -
358 offsetof(struct htab_elem, lru_node),
359 htab_lru_map_delete_node,
360 htab);
361 else
362 err = pcpu_freelist_init(&htab->freelist);
363
364 if (err)
365 goto free_elems;
366
367 if (htab_is_lru(htab))
368 bpf_lru_populate(&htab->lru, htab->elems,
369 offsetof(struct htab_elem, lru_node),
370 htab->elem_size, num_entries);
371 else
372 pcpu_freelist_populate(&htab->freelist,
373 htab->elems + offsetof(struct htab_elem, fnode),
374 htab->elem_size, num_entries);
375
376 return 0;
377
378 free_elems:
379 htab_free_elems(htab);
380 return err;
381 }
382
383 static void prealloc_destroy(struct bpf_htab *htab)
384 {
385 htab_free_elems(htab);
386
387 if (htab_is_lru(htab))
388 bpf_lru_destroy(&htab->lru);
389 else
390 pcpu_freelist_destroy(&htab->freelist);
391 }
392
393 static int alloc_extra_elems(struct bpf_htab *htab)
394 {
395 struct htab_elem *__percpu *pptr, *l_new;
396 struct pcpu_freelist_node *l;
397 int cpu;
398
399 pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
400 GFP_USER | __GFP_NOWARN);
401 if (!pptr)
402 return -ENOMEM;
403
404 for_each_possible_cpu(cpu) {
405 l = pcpu_freelist_pop(&htab->freelist);
406 /* pop will succeed, since prealloc_init()
407 * preallocated extra num_possible_cpus() elements
408 */
409 l_new = container_of(l, struct htab_elem, fnode);
410 *per_cpu_ptr(pptr, cpu) = l_new;
411 }
412 htab->extra_elems = pptr;
413 return 0;
414 }
415
416 /* Called from syscall */
417 static int htab_map_alloc_check(union bpf_attr *attr)
418 {
419 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
420 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
421 bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
422 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
423 /* percpu_lru means each cpu has its own LRU list.
424 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
425 * the map's value itself is percpu. percpu_lru has
426 * nothing to do with the map's value.
427 */
428 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
429 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
430 bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
431 int numa_node = bpf_map_attr_numa_node(attr);
432
433 BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
434 offsetof(struct htab_elem, hash_node.pprev));
435
436 if (zero_seed && !capable(CAP_SYS_ADMIN))
437 /* Guard against local DoS, and discourage production use. */
438 return -EPERM;
439
440 if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
441 !bpf_map_flags_access_ok(attr->map_flags))
442 return -EINVAL;
443
444 if (!lru && percpu_lru)
445 return -EINVAL;
446
447 if (lru && !prealloc)
448 return -ENOTSUPP;
449
450 if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
451 return -EINVAL;
452
453 /* check sanity of attributes.
454 * value_size == 0 may be allowed in the future to use map as a set
455 */
456 if (attr->max_entries == 0 || attr->key_size == 0 ||
457 attr->value_size == 0)
458 return -EINVAL;
459
460 if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
461 sizeof(struct htab_elem))
462 /* if key_size + value_size is bigger, the user space won't be
463 * able to access the elements via bpf syscall. This check
464 * also makes sure that the elem_size doesn't overflow and it's
465 * kmalloc-able later in htab_map_update_elem()
466 */
467 return -E2BIG;
468 /* percpu map value size is bound by PCPU_MIN_UNIT_SIZE */
469 if (percpu && round_up(attr->value_size, 8) > PCPU_MIN_UNIT_SIZE)
470 return -E2BIG;
471
472 return 0;
473 }
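
/* Illustrative user-space creation of such a map (a sketch using libbpf's
 * bpf_map_create(); the name and sizes are hypothetical):
 *
 *	int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_map",
 *				sizeof(__u32), sizeof(__u64), 1024, NULL);
 *
 * The attributes arrive here via union bpf_attr and are validated by
 * htab_map_alloc_check() above before htab_map_alloc() runs.
 */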
474
475 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
476 {
477 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
478 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
479 /* percpu_lru means each cpu has its own LRU list.
480 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
481 * the map's value itself is percpu. percpu_lru has
482 * nothing to do with the map's value.
483 */
484 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
485 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
486 struct bpf_htab *htab;
487 int err;
488
489 htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
490 if (!htab)
491 return ERR_PTR(-ENOMEM);
492
493 bpf_map_init_from_attr(&htab->map, attr);
494
495 if (percpu_lru) {
496 /* ensure each CPU's lru list has >= 1 element.
497 * While we are at it, make each lru list have the same
498 * number of elements.
499 */
500 htab->map.max_entries = roundup(attr->max_entries,
501 num_possible_cpus());
502 if (htab->map.max_entries < attr->max_entries)
503 htab->map.max_entries = rounddown(attr->max_entries,
504 num_possible_cpus());
505 }
506
507 /* hash table size must be power of 2; roundup_pow_of_two() can overflow
508 * into UB on 32-bit arches, so check that first
509 */
510 err = -E2BIG;
511 if (htab->map.max_entries > 1UL << 31)
512 goto free_htab;
513
514 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
515
516 htab->elem_size = sizeof(struct htab_elem) +
517 round_up(htab->map.key_size, 8);
518 if (percpu)
519 htab->elem_size += sizeof(void *);
520 else
521 htab->elem_size += round_up(htab->map.value_size, 8);
522
523 /* check for u32 overflow */
524 if (htab->n_buckets > U32_MAX / sizeof(struct bucket))
525 goto free_htab;
526
527 err = bpf_map_init_elem_count(&htab->map);
528 if (err)
529 goto free_htab;
530
531 err = -ENOMEM;
532 htab->buckets = bpf_map_area_alloc(htab->n_buckets *
533 sizeof(struct bucket),
534 htab->map.numa_node);
535 if (!htab->buckets)
536 goto free_elem_count;
537
538 if (htab->map.map_flags & BPF_F_ZERO_SEED)
539 htab->hashrnd = 0;
540 else
541 htab->hashrnd = get_random_u32();
542
543 htab_init_buckets(htab);
544
545 /* compute_batch_value() computes batch value as num_online_cpus() * 2
546 * and __percpu_counter_compare() needs
547 * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus()
548 * for percpu_counter to be faster than atomic_t. In practice the average bpf
549 * hash map size is 10k, which means that a system with 64 cpus will fill
550 * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore
551 * we define our own batch count as 32, so a 10k hash map can be filled up to 80%:
552 * 10k - 8k > 32 _batch_ * 64 _cpus_
553 * and __percpu_counter_compare() will still be fast. At that point hash map
554 * collisions will dominate its performance anyway. Assume that hash map filled
555 * to 50+% isn't going to be O(1) and use the following formula to choose
556 * between percpu_counter and atomic_t.
557 */
558 #define PERCPU_COUNTER_BATCH 32
559 if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH)
560 htab->use_percpu_counter = true;
561
562 if (htab->use_percpu_counter) {
563 err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL);
564 if (err)
565 goto free_map_locked;
566 }
567
568 if (prealloc) {
569 err = prealloc_init(htab);
570 if (err)
571 goto free_map_locked;
572
573 if (htab_has_extra_elems(htab)) {
574 err = alloc_extra_elems(htab);
575 if (err)
576 goto free_prealloc;
577 }
578 } else {
579 err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false);
580 if (err)
581 goto free_map_locked;
582 if (percpu) {
583 err = bpf_mem_alloc_init(&htab->pcpu_ma,
584 round_up(htab->map.value_size, 8), true);
585 if (err)
586 goto free_map_locked;
587 }
588 }
589
590 return &htab->map;
591
592 free_prealloc:
593 prealloc_destroy(htab);
594 free_map_locked:
595 if (htab->use_percpu_counter)
596 percpu_counter_destroy(&htab->pcount);
597 bpf_map_area_free(htab->buckets);
598 bpf_mem_alloc_destroy(&htab->pcpu_ma);
599 bpf_mem_alloc_destroy(&htab->ma);
600 free_elem_count:
601 bpf_map_free_elem_count(&htab->map);
602 free_htab:
603 bpf_map_area_free(htab);
604 return ERR_PTR(err);
605 }
606
607 static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
608 {
609 if (likely(key_len % 4 == 0))
610 return jhash2(key, key_len / 4, hashrnd);
611 return jhash(key, key_len, hashrnd);
612 }
613
614 static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
615 {
616 return &htab->buckets[hash & (htab->n_buckets - 1)];
617 }
618
619 static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
620 {
621 return &__select_bucket(htab, hash)->head;
622 }
623
624 /* this lookup function can only be called with bucket lock taken */
625 static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
626 void *key, u32 key_size)
627 {
628 struct hlist_nulls_node *n;
629 struct htab_elem *l;
630
631 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
632 if (l->hash == hash && !memcmp(&l->key, key, key_size))
633 return l;
634
635 return NULL;
636 }
637
638 /* Can be called without the bucket lock. It will repeat the loop in
639 * the unlikely event that elements moved from one bucket into another
640 * while the linked list was being walked.
641 */
642 static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
643 u32 hash, void *key,
644 u32 key_size, u32 n_buckets)
645 {
646 struct hlist_nulls_node *n;
647 struct htab_elem *l;
648
649 again:
650 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
651 if (l->hash == hash && !memcmp(&l->key, key, key_size))
652 return l;
653
654 if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
655 goto again;
656
657 return NULL;
658 }
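
/* Illustrative reasoning for the nulls check above (a sketch): each bucket's
 * list is terminated by a "nulls" marker carrying the bucket index (see
 * htab_init_buckets()). If an element is freed and reused in another bucket
 * while the list is walked without the lock, the walk can end on a marker
 * whose value does not match this bucket, and it is simply restarted.
 */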
659
660 /* Called from syscall or from eBPF program directly, so
661 * arguments have to match bpf_map_lookup_elem() exactly.
662 * The return value is adjusted by BPF instructions
663 * in htab_map_gen_lookup().
664 */
665 static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
666 {
667 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
668 struct hlist_nulls_head *head;
669 struct htab_elem *l;
670 u32 hash, key_size;
671
672 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
673 !rcu_read_lock_bh_held());
674
675 key_size = map->key_size;
676
677 hash = htab_map_hash(key, key_size, htab->hashrnd);
678
679 head = select_bucket(htab, hash);
680
681 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
682
683 return l;
684 }
685
686 static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
687 {
688 struct htab_elem *l = __htab_map_lookup_elem(map, key);
689
690 if (l)
691 return htab_elem_value(l, map->key_size);
692
693 return NULL;
694 }
695
696 /* inline bpf_map_lookup_elem() call.
697 * Instead of:
698 * bpf_prog
699 * bpf_map_lookup_elem
700 * map->ops->map_lookup_elem
701 * htab_map_lookup_elem
702 * __htab_map_lookup_elem
703 * do:
704 * bpf_prog
705 * __htab_map_lookup_elem
706 */
707 static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
708 {
709 struct bpf_insn *insn = insn_buf;
710 const int ret = BPF_REG_0;
711
712 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
713 (void *(*)(struct bpf_map *map, void *key))NULL));
714 *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
715 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
716 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
717 offsetof(struct htab_elem, key) +
718 round_up(map->key_size, 8));
719 return insn - insn_buf;
720 }
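
/* Illustrative view of what the emitted instructions do (a sketch in C-like
 * pseudocode):
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	return l ? (void *)l + offsetof(struct htab_elem, key) +
 *		   round_up(map->key_size, 8) : NULL;
 *
 * i.e. on a hit the returned pointer is the value, matching what
 * htab_map_lookup_elem() computes via htab_elem_value().
 */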
721
722 static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
723 void *key, const bool mark)
724 {
725 struct htab_elem *l = __htab_map_lookup_elem(map, key);
726
727 if (l) {
728 if (mark)
729 bpf_lru_node_set_ref(&l->lru_node);
730 return htab_elem_value(l, map->key_size);
731 }
732
733 return NULL;
734 }
735
736 static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
737 {
738 return __htab_lru_map_lookup_elem(map, key, true);
739 }
740
741 static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
742 {
743 return __htab_lru_map_lookup_elem(map, key, false);
744 }
745
746 static int htab_lru_map_gen_lookup(struct bpf_map *map,
747 struct bpf_insn *insn_buf)
748 {
749 struct bpf_insn *insn = insn_buf;
750 const int ret = BPF_REG_0;
751 const int ref_reg = BPF_REG_1;
752
753 BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
754 (void *(*)(struct bpf_map *map, void *key))NULL));
755 *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
756 *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
757 *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
758 offsetof(struct htab_elem, lru_node) +
759 offsetof(struct bpf_lru_node, ref));
760 *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
761 *insn++ = BPF_ST_MEM(BPF_B, ret,
762 offsetof(struct htab_elem, lru_node) +
763 offsetof(struct bpf_lru_node, ref),
764 1);
765 *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
766 offsetof(struct htab_elem, key) +
767 round_up(map->key_size, 8));
768 return insn - insn_buf;
769 }
770
771 static void check_and_free_fields(struct bpf_htab *htab,
772 struct htab_elem *elem)
773 {
774 if (IS_ERR_OR_NULL(htab->map.record))
775 return;
776
777 if (htab_is_percpu(htab)) {
778 void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
779 int cpu;
780
781 for_each_possible_cpu(cpu)
782 bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
783 } else {
784 void *map_value = htab_elem_value(elem, htab->map.key_size);
785
786 bpf_obj_free_fields(htab->map.record, map_value);
787 }
788 }
789
790 /* It is called from the bpf_lru_list when the LRU needs to delete
791 * older elements from the htab.
792 */
793 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
794 {
795 struct bpf_htab *htab = arg;
796 struct htab_elem *l = NULL, *tgt_l;
797 struct hlist_nulls_head *head;
798 struct hlist_nulls_node *n;
799 unsigned long flags;
800 struct bucket *b;
801 int ret;
802
803 tgt_l = container_of(node, struct htab_elem, lru_node);
804 b = __select_bucket(htab, tgt_l->hash);
805 head = &b->head;
806
807 ret = htab_lock_bucket(b, &flags);
808 if (ret)
809 return false;
810
811 hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
812 if (l == tgt_l) {
813 hlist_nulls_del_rcu(&l->hash_node);
814 bpf_map_dec_elem_count(&htab->map);
815 break;
816 }
817
818 htab_unlock_bucket(b, flags);
819
820 if (l == tgt_l)
821 check_and_free_fields(htab, l);
822 return l == tgt_l;
823 }
824
825 /* Called from syscall */
826 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
827 {
828 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
829 struct hlist_nulls_head *head;
830 struct htab_elem *l, *next_l;
831 u32 hash, key_size;
832 int i = 0;
833
834 WARN_ON_ONCE(!rcu_read_lock_held());
835
836 key_size = map->key_size;
837
838 if (!key)
839 goto find_first_elem;
840
841 hash = htab_map_hash(key, key_size, htab->hashrnd);
842
843 head = select_bucket(htab, hash);
844
845 /* lookup the key */
846 l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
847
848 if (!l)
849 goto find_first_elem;
850
851 /* key was found, get next key in the same bucket */
852 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
853 struct htab_elem, hash_node);
854
855 if (next_l) {
856 /* if next elem in this hash list is non-zero, just return it */
857 memcpy(next_key, next_l->key, key_size);
858 return 0;
859 }
860
861 /* no more elements in this hash list, go to the next bucket */
862 i = hash & (htab->n_buckets - 1);
863 i++;
864
865 find_first_elem:
866 /* iterate over buckets */
867 for (; i < htab->n_buckets; i++) {
868 head = select_bucket(htab, i);
869
870 /* pick first element in the bucket */
871 next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
872 struct htab_elem, hash_node);
873 if (next_l) {
874 /* if it's not empty, just return it */
875 memcpy(next_key, next_l->key, key_size);
876 return 0;
877 }
878 }
879
880 /* iterated over all buckets and all elements */
881 return -ENOENT;
882 }
883
884 static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
885 {
886 check_and_free_fields(htab, l);
887
888 if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
889 bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
890 bpf_mem_cache_free(&htab->ma, l);
891 }
892
893 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
894 {
895 struct bpf_map *map = &htab->map;
896 void *ptr;
897
898 if (map->ops->map_fd_put_ptr) {
899 ptr = fd_htab_map_get_ptr(map, l);
900 map->ops->map_fd_put_ptr(map, ptr, true);
901 }
902 }
903
904 static bool is_map_full(struct bpf_htab *htab)
905 {
906 if (htab->use_percpu_counter)
907 return __percpu_counter_compare(&htab->pcount, htab->map.max_entries,
908 PERCPU_COUNTER_BATCH) >= 0;
909 return atomic_read(&htab->count) >= htab->map.max_entries;
910 }
911
912 static void inc_elem_count(struct bpf_htab *htab)
913 {
914 bpf_map_inc_elem_count(&htab->map);
915
916 if (htab->use_percpu_counter)
917 percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH);
918 else
919 atomic_inc(&htab->count);
920 }
921
922 static void dec_elem_count(struct bpf_htab *htab)
923 {
924 bpf_map_dec_elem_count(&htab->map);
925
926 if (htab->use_percpu_counter)
927 percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH);
928 else
929 atomic_dec(&htab->count);
930 }
931
932
933 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
934 {
935 htab_put_fd_value(htab, l);
936
937 if (htab_is_prealloc(htab)) {
938 bpf_map_dec_elem_count(&htab->map);
939 check_and_free_fields(htab, l);
940 pcpu_freelist_push(&htab->freelist, &l->fnode);
941 } else {
942 dec_elem_count(htab);
943 htab_elem_free(htab, l);
944 }
945 }
946
947 static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
948 void *value, bool onallcpus)
949 {
950 if (!onallcpus) {
951 /* copy true value_size bytes */
952 copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
953 } else {
954 u32 size = round_up(htab->map.value_size, 8);
955 int off = 0, cpu;
956
957 for_each_possible_cpu(cpu) {
958 copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
959 off += size;
960 }
961 }
962 }
963
964 static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
965 void *value, bool onallcpus)
966 {
967 /* When not setting the initial value on all cpus, zero-fill element
968 * values for other cpus. Otherwise, bpf program has no way to ensure
969 * known initial values for cpus other than current one
970 * (onallcpus=false always when coming from bpf prog).
971 */
972 if (!onallcpus) {
973 int current_cpu = raw_smp_processor_id();
974 int cpu;
975
976 for_each_possible_cpu(cpu) {
977 if (cpu == current_cpu)
978 copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
979 else /* Since elem is preallocated, we cannot touch special fields */
980 zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
981 }
982 } else {
983 pcpu_copy_value(htab, pptr, value, onallcpus);
984 }
985 }
986
987 static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
988 {
989 return is_fd_htab(htab) && BITS_PER_LONG == 64;
990 }
991
992 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
993 void *value, u32 key_size, u32 hash,
994 bool percpu, bool onallcpus,
995 struct htab_elem *old_elem)
996 {
997 u32 size = htab->map.value_size;
998 bool prealloc = htab_is_prealloc(htab);
999 struct htab_elem *l_new, **pl_new;
1000 void __percpu *pptr;
1001
1002 if (prealloc) {
1003 if (old_elem) {
1004 /* if we're updating the existing element,
1005 * use per-cpu extra elems to avoid freelist_pop/push
1006 */
1007 pl_new = this_cpu_ptr(htab->extra_elems);
1008 l_new = *pl_new;
1009 *pl_new = old_elem;
1010 } else {
1011 struct pcpu_freelist_node *l;
1012
1013 l = __pcpu_freelist_pop(&htab->freelist);
1014 if (!l)
1015 return ERR_PTR(-E2BIG);
1016 l_new = container_of(l, struct htab_elem, fnode);
1017 bpf_map_inc_elem_count(&htab->map);
1018 }
1019 } else {
1020 if (is_map_full(htab))
1021 if (!old_elem)
1022 /* when map is full and update() is replacing
1023 * old element, it's ok to allocate, since
1024 * old element will be freed immediately.
1025 * Otherwise return an error
1026 */
1027 return ERR_PTR(-E2BIG);
1028 inc_elem_count(htab);
1029 l_new = bpf_mem_cache_alloc(&htab->ma);
1030 if (!l_new) {
1031 l_new = ERR_PTR(-ENOMEM);
1032 goto dec_count;
1033 }
1034 }
1035
1036 memcpy(l_new->key, key, key_size);
1037 if (percpu) {
1038 if (prealloc) {
1039 pptr = htab_elem_get_ptr(l_new, key_size);
1040 } else {
1041 /* alloc_percpu zero-fills */
1042 void *ptr = bpf_mem_cache_alloc(&htab->pcpu_ma);
1043
1044 if (!ptr) {
1045 bpf_mem_cache_free(&htab->ma, l_new);
1046 l_new = ERR_PTR(-ENOMEM);
1047 goto dec_count;
1048 }
1049 l_new->ptr_to_pptr = ptr;
1050 pptr = *(void __percpu **)ptr;
1051 }
1052
1053 pcpu_init_value(htab, pptr, value, onallcpus);
1054
1055 if (!prealloc)
1056 htab_elem_set_ptr(l_new, key_size, pptr);
1057 } else if (fd_htab_map_needs_adjust(htab)) {
1058 size = round_up(size, 8);
1059 memcpy(htab_elem_value(l_new, key_size), value, size);
1060 } else {
1061 copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value);
1062 }
1063
1064 l_new->hash = hash;
1065 return l_new;
1066 dec_count:
1067 dec_elem_count(htab);
1068 return l_new;
1069 }
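
/* Illustrative summary of alloc_htab_elem() (a sketch):
 *
 *	prealloc, replacing an old elem -> reuse this CPU's extra elem
 *	prealloc, new key               -> pop from the per-cpu freelist
 *	!prealloc                       -> bpf_mem_cache_alloc(), plus a
 *	                                   per-cpu allocation for per-cpu maps
 */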
1070
1071 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1072 u64 map_flags)
1073 {
1074 if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1075 /* elem already exists */
1076 return -EEXIST;
1077
1078 if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1079 /* elem doesn't exist, cannot update it */
1080 return -ENOENT;
1081
1082 return 0;
1083 }
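
/* Illustrative flag semantics enforced by check_flags() (a sketch):
 *
 *	BPF_ANY     -> create or update
 *	BPF_NOEXIST -> create only, -EEXIST if the key already exists
 *	BPF_EXIST   -> update only, -ENOENT if the key is absent
 */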
1084
1085 /* Called from syscall or from eBPF program */
1086 static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1087 u64 map_flags)
1088 {
1089 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1090 struct htab_elem *l_new, *l_old;
1091 struct hlist_nulls_head *head;
1092 unsigned long flags;
1093 struct bucket *b;
1094 u32 key_size, hash;
1095 int ret;
1096
1097 if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1098 /* unknown flags */
1099 return -EINVAL;
1100
1101 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1102 !rcu_read_lock_bh_held());
1103
1104 key_size = map->key_size;
1105
1106 hash = htab_map_hash(key, key_size, htab->hashrnd);
1107
1108 b = __select_bucket(htab, hash);
1109 head = &b->head;
1110
1111 if (unlikely(map_flags & BPF_F_LOCK)) {
1112 if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1113 return -EINVAL;
1114 /* find an element without taking the bucket lock */
1115 l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1116 htab->n_buckets);
1117 ret = check_flags(htab, l_old, map_flags);
1118 if (ret)
1119 return ret;
1120 if (l_old) {
1121 /* grab the element lock and update value in place */
1122 copy_map_value_locked(map,
1123 htab_elem_value(l_old, key_size),
1124 value, false);
1125 return 0;
1126 }
1127 /* fall through, grab the bucket lock and lookup again.
1128 * 99.9% chance that the element won't be found,
1129 * but second lookup under lock has to be done.
1130 */
1131 }
1132
1133 ret = htab_lock_bucket(b, &flags);
1134 if (ret)
1135 return ret;
1136
1137 l_old = lookup_elem_raw(head, hash, key, key_size);
1138
1139 ret = check_flags(htab, l_old, map_flags);
1140 if (ret)
1141 goto err;
1142
1143 if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1144 /* first lookup without the bucket lock didn't find the element,
1145 * but second lookup with the bucket lock found it.
1146 * This case is highly unlikely, but has to be dealt with:
1147 * grab the element lock in addition to the bucket lock
1148 * and update element in place
1149 */
1150 copy_map_value_locked(map,
1151 htab_elem_value(l_old, key_size),
1152 value, false);
1153 ret = 0;
1154 goto err;
1155 }
1156
1157 l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1158 l_old);
1159 if (IS_ERR(l_new)) {
1160 /* all pre-allocated elements are in use or memory exhausted */
1161 ret = PTR_ERR(l_new);
1162 goto err;
1163 }
1164
1165 /* add new element to the head of the list, so that
1166 * concurrent search will find it before old elem
1167 */
1168 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1169 if (l_old) {
1170 hlist_nulls_del_rcu(&l_old->hash_node);
1171
1172 /* l_old has already been stashed in htab->extra_elems, free
1173 * its special fields before it is available for reuse.
1174 */
1175 if (htab_is_prealloc(htab))
1176 check_and_free_fields(htab, l_old);
1177 }
1178 htab_unlock_bucket(b, flags);
1179 if (l_old && !htab_is_prealloc(htab))
1180 free_htab_elem(htab, l_old);
1181 return 0;
1182 err:
1183 htab_unlock_bucket(b, flags);
1184 return ret;
1185 }
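
/* Illustrative BPF-program-side usage (a sketch; the map and value names are
 * hypothetical):
 *
 *	struct val v = { .cnt = 1 };
 *
 *	bpf_map_update_elem(&my_hash, &key, &v, BPF_ANY);
 *
 * The helper reaches htab_map_update_elem() via map->ops->map_update_elem.
 */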
1186
1187 static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1188 {
1189 check_and_free_fields(htab, elem);
1190 bpf_map_dec_elem_count(&htab->map);
1191 bpf_lru_push_free(&htab->lru, &elem->lru_node);
1192 }
1193
1194 static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1195 u64 map_flags)
1196 {
1197 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1198 struct htab_elem *l_new, *l_old = NULL;
1199 struct hlist_nulls_head *head;
1200 unsigned long flags;
1201 struct bucket *b;
1202 u32 key_size, hash;
1203 int ret;
1204
1205 if (unlikely(map_flags > BPF_EXIST))
1206 /* unknown flags */
1207 return -EINVAL;
1208
1209 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1210 !rcu_read_lock_bh_held());
1211
1212 key_size = map->key_size;
1213
1214 hash = htab_map_hash(key, key_size, htab->hashrnd);
1215
1216 b = __select_bucket(htab, hash);
1217 head = &b->head;
1218
1219 /* For LRU, we need to alloc before taking bucket's
1220 * spinlock because getting free nodes from LRU may need
1221 * to remove older elements from htab and this removal
1222 * operation will need a bucket lock.
1223 */
1224 l_new = prealloc_lru_pop(htab, key, hash);
1225 if (!l_new)
1226 return -ENOMEM;
1227 copy_map_value(&htab->map, htab_elem_value(l_new, map->key_size), value);
1228
1229 ret = htab_lock_bucket(b, &flags);
1230 if (ret)
1231 goto err_lock_bucket;
1232
1233 l_old = lookup_elem_raw(head, hash, key, key_size);
1234
1235 ret = check_flags(htab, l_old, map_flags);
1236 if (ret)
1237 goto err;
1238
1239 /* add new element to the head of the list, so that
1240 * concurrent search will find it before old elem
1241 */
1242 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1243 if (l_old) {
1244 bpf_lru_node_set_ref(&l_new->lru_node);
1245 hlist_nulls_del_rcu(&l_old->hash_node);
1246 }
1247 ret = 0;
1248
1249 err:
1250 htab_unlock_bucket(b, flags);
1251
1252 err_lock_bucket:
1253 if (ret)
1254 htab_lru_push_free(htab, l_new);
1255 else if (l_old)
1256 htab_lru_push_free(htab, l_old);
1257
1258 return ret;
1259 }
1260
1261 static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
1262 void *value, u64 map_flags,
1263 bool percpu, bool onallcpus)
1264 {
1265 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1266 struct htab_elem *l_new, *l_old;
1267 struct hlist_nulls_head *head;
1268 void *old_map_ptr = NULL;
1269 unsigned long flags;
1270 struct bucket *b;
1271 u32 key_size, hash;
1272 int ret;
1273
1274 if (unlikely(map_flags > BPF_EXIST))
1275 /* unknown flags */
1276 return -EINVAL;
1277
1278 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1279 !rcu_read_lock_bh_held());
1280
1281 key_size = map->key_size;
1282
1283 hash = htab_map_hash(key, key_size, htab->hashrnd);
1284
1285 b = __select_bucket(htab, hash);
1286 head = &b->head;
1287
1288 ret = htab_lock_bucket(b, &flags);
1289 if (ret)
1290 return ret;
1291
1292 l_old = lookup_elem_raw(head, hash, key, key_size);
1293
1294 ret = check_flags(htab, l_old, map_flags);
1295 if (ret)
1296 goto err;
1297
1298 if (l_old) {
1299 /* Update value in-place */
1300 if (percpu) {
1301 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1302 value, onallcpus);
1303 } else {
1304 void **inner_map_pptr = htab_elem_value(l_old, key_size);
1305
1306 old_map_ptr = *inner_map_pptr;
1307 WRITE_ONCE(*inner_map_pptr, *(void **)value);
1308 }
1309 } else {
1310 l_new = alloc_htab_elem(htab, key, value, key_size,
1311 hash, percpu, onallcpus, NULL);
1312 if (IS_ERR(l_new)) {
1313 ret = PTR_ERR(l_new);
1314 goto err;
1315 }
1316 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1317 }
1318 err:
1319 htab_unlock_bucket(b, flags);
1320 if (old_map_ptr)
1321 map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1322 return ret;
1323 }
1324
1325 static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1326 void *value, u64 map_flags,
1327 bool onallcpus)
1328 {
1329 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1330 struct htab_elem *l_new = NULL, *l_old;
1331 struct hlist_nulls_head *head;
1332 unsigned long flags;
1333 struct bucket *b;
1334 u32 key_size, hash;
1335 int ret;
1336
1337 if (unlikely(map_flags > BPF_EXIST))
1338 /* unknown flags */
1339 return -EINVAL;
1340
1341 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1342 !rcu_read_lock_bh_held());
1343
1344 key_size = map->key_size;
1345
1346 hash = htab_map_hash(key, key_size, htab->hashrnd);
1347
1348 b = __select_bucket(htab, hash);
1349 head = &b->head;
1350
1351 /* For LRU, we need to alloc before taking bucket's
1352 * spinlock because LRU's elem alloc may need
1353 * to remove older elem from htab and this removal
1354 * operation will need a bucket lock.
1355 */
1356 if (map_flags != BPF_EXIST) {
1357 l_new = prealloc_lru_pop(htab, key, hash);
1358 if (!l_new)
1359 return -ENOMEM;
1360 }
1361
1362 ret = htab_lock_bucket(b, &flags);
1363 if (ret)
1364 goto err_lock_bucket;
1365
1366 l_old = lookup_elem_raw(head, hash, key, key_size);
1367
1368 ret = check_flags(htab, l_old, map_flags);
1369 if (ret)
1370 goto err;
1371
1372 if (l_old) {
1373 bpf_lru_node_set_ref(&l_old->lru_node);
1374
1375 /* per-cpu hash map can update value in-place */
1376 pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1377 value, onallcpus);
1378 } else {
1379 pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1380 value, onallcpus);
1381 hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1382 l_new = NULL;
1383 }
1384 ret = 0;
1385 err:
1386 htab_unlock_bucket(b, flags);
1387 err_lock_bucket:
1388 if (l_new) {
1389 bpf_map_dec_elem_count(&htab->map);
1390 bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1391 }
1392 return ret;
1393 }
1394
1395 static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1396 void *value, u64 map_flags)
1397 {
1398 return htab_map_update_elem_in_place(map, key, value, map_flags, true, false);
1399 }
1400
1401 static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1402 void *value, u64 map_flags)
1403 {
1404 return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1405 false);
1406 }
1407
1408 /* Called from syscall or from eBPF program */
1409 static long htab_map_delete_elem(struct bpf_map *map, void *key)
1410 {
1411 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1412 struct hlist_nulls_head *head;
1413 struct bucket *b;
1414 struct htab_elem *l;
1415 unsigned long flags;
1416 u32 hash, key_size;
1417 int ret;
1418
1419 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1420 !rcu_read_lock_bh_held());
1421
1422 key_size = map->key_size;
1423
1424 hash = htab_map_hash(key, key_size, htab->hashrnd);
1425 b = __select_bucket(htab, hash);
1426 head = &b->head;
1427
1428 ret = htab_lock_bucket(b, &flags);
1429 if (ret)
1430 return ret;
1431
1432 l = lookup_elem_raw(head, hash, key, key_size);
1433 if (l)
1434 hlist_nulls_del_rcu(&l->hash_node);
1435 else
1436 ret = -ENOENT;
1437
1438 htab_unlock_bucket(b, flags);
1439
1440 if (l)
1441 free_htab_elem(htab, l);
1442 return ret;
1443 }
1444
1445 static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1446 {
1447 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1448 struct hlist_nulls_head *head;
1449 struct bucket *b;
1450 struct htab_elem *l;
1451 unsigned long flags;
1452 u32 hash, key_size;
1453 int ret;
1454
1455 WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1456 !rcu_read_lock_bh_held());
1457
1458 key_size = map->key_size;
1459
1460 hash = htab_map_hash(key, key_size, htab->hashrnd);
1461 b = __select_bucket(htab, hash);
1462 head = &b->head;
1463
1464 ret = htab_lock_bucket(b, &flags);
1465 if (ret)
1466 return ret;
1467
1468 l = lookup_elem_raw(head, hash, key, key_size);
1469
1470 if (l)
1471 hlist_nulls_del_rcu(&l->hash_node);
1472 else
1473 ret = -ENOENT;
1474
1475 htab_unlock_bucket(b, flags);
1476 if (l)
1477 htab_lru_push_free(htab, l);
1478 return ret;
1479 }
1480
1481 static void delete_all_elements(struct bpf_htab *htab)
1482 {
1483 int i;
1484
1485 /* It's called from a worker thread and migration has been disabled,
1486 * therefore, it is OK to invoke bpf_mem_cache_free() directly.
1487 */
1488 for (i = 0; i < htab->n_buckets; i++) {
1489 struct hlist_nulls_head *head = select_bucket(htab, i);
1490 struct hlist_nulls_node *n;
1491 struct htab_elem *l;
1492
1493 hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1494 hlist_nulls_del_rcu(&l->hash_node);
1495 htab_elem_free(htab, l);
1496 }
1497 cond_resched();
1498 }
1499 }
1500
1501 static void htab_free_malloced_internal_structs(struct bpf_htab *htab)
1502 {
1503 int i;
1504
1505 rcu_read_lock();
1506 for (i = 0; i < htab->n_buckets; i++) {
1507 struct hlist_nulls_head *head = select_bucket(htab, i);
1508 struct hlist_nulls_node *n;
1509 struct htab_elem *l;
1510
1511 hlist_nulls_for_each_entry(l, n, head, hash_node) {
1512 /* We only free timer, workqueue and task_work on uref dropping to zero */
1513 htab_free_internal_structs(htab, l);
1514 }
1515 cond_resched_rcu();
1516 }
1517 rcu_read_unlock();
1518 }
1519
1520 static void htab_map_free_internal_structs(struct bpf_map *map)
1521 {
1522 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1523
1524 /* We only free timer, workqueue and task_work on uref dropping to zero */
1525 if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) {
1526 if (!htab_is_prealloc(htab))
1527 htab_free_malloced_internal_structs(htab);
1528 else
1529 htab_free_prealloced_internal_structs(htab);
1530 }
1531 }
1532
1533 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1534 static void htab_map_free(struct bpf_map *map)
1535 {
1536 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1537
1538 /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1539 * bpf_free_used_maps() is called after bpf prog is no longer executing.
1540 * There is no need to synchronize_rcu() here to protect map elements.
1541 */
1542
1543 /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it
1544 * underneath and is responsible for waiting for callbacks to finish
1545 * during bpf_mem_alloc_destroy().
1546 */
1547 if (!htab_is_prealloc(htab)) {
1548 delete_all_elements(htab);
1549 } else {
1550 htab_free_prealloced_fields(htab);
1551 prealloc_destroy(htab);
1552 }
1553
1554 bpf_map_free_elem_count(map);
1555 free_percpu(htab->extra_elems);
1556 bpf_map_area_free(htab->buckets);
1557 bpf_mem_alloc_destroy(&htab->pcpu_ma);
1558 bpf_mem_alloc_destroy(&htab->ma);
1559 if (htab->use_percpu_counter)
1560 percpu_counter_destroy(&htab->pcount);
1561 bpf_map_area_free(htab);
1562 }
1563
1564 static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1565 struct seq_file *m)
1566 {
1567 void *value;
1568
1569 rcu_read_lock();
1570
1571 value = htab_map_lookup_elem(map, key);
1572 if (!value) {
1573 rcu_read_unlock();
1574 return;
1575 }
1576
1577 btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1578 seq_puts(m, ": ");
1579 btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1580 seq_putc(m, '\n');
1581
1582 rcu_read_unlock();
1583 }
1584
1585 static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1586 void *value, bool is_lru_map,
1587 bool is_percpu, u64 flags)
1588 {
1589 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1590 struct hlist_nulls_head *head;
1591 unsigned long bflags;
1592 struct htab_elem *l;
1593 u32 hash, key_size;
1594 struct bucket *b;
1595 int ret;
1596
1597 key_size = map->key_size;
1598
1599 hash = htab_map_hash(key, key_size, htab->hashrnd);
1600 b = __select_bucket(htab, hash);
1601 head = &b->head;
1602
1603 ret = htab_lock_bucket(b, &bflags);
1604 if (ret)
1605 return ret;
1606
1607 l = lookup_elem_raw(head, hash, key, key_size);
1608 if (!l) {
1609 ret = -ENOENT;
1610 goto out_unlock;
1611 }
1612
1613 if (is_percpu) {
1614 u32 roundup_value_size = round_up(map->value_size, 8);
1615 void __percpu *pptr;
1616 int off = 0, cpu;
1617
1618 pptr = htab_elem_get_ptr(l, key_size);
1619 for_each_possible_cpu(cpu) {
1620 copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
1621 check_and_init_map_value(&htab->map, value + off);
1622 off += roundup_value_size;
1623 }
1624 } else {
1625 void *src = htab_elem_value(l, map->key_size);
1626
1627 if (flags & BPF_F_LOCK)
1628 copy_map_value_locked(map, value, src, true);
1629 else
1630 copy_map_value(map, value, src);
1631 /* Zeroing special fields in the temp buffer */
1632 check_and_init_map_value(map, value);
1633 }
1634 hlist_nulls_del_rcu(&l->hash_node);
1635
1636 out_unlock:
1637 htab_unlock_bucket(b, bflags);
1638
1639 if (l) {
1640 if (is_lru_map)
1641 htab_lru_push_free(htab, l);
1642 else
1643 free_htab_elem(htab, l);
1644 }
1645
1646 return ret;
1647 }
1648
1649 static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1650 void *value, u64 flags)
1651 {
1652 return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1653 flags);
1654 }
1655
1656 static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1657 void *key, void *value,
1658 u64 flags)
1659 {
1660 return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1661 flags);
1662 }
1663
1664 static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1665 void *value, u64 flags)
1666 {
1667 return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1668 flags);
1669 }
1670
1671 static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1672 void *key, void *value,
1673 u64 flags)
1674 {
1675 return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1676 flags);
1677 }
1678
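/* Illustrative user-space batch lookup (a sketch using libbpf; fd, keys and
 * vals are hypothetical buffers sized for "count" entries):
 *
 *	__u32 out_batch, count = 128;
 *
 *	bpf_map_lookup_batch(fd, NULL, &out_batch, keys, vals, &count, NULL);
 *
 * This reaches __htab_map_lookup_and_delete_batch() below with
 * do_delete == false.
 */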
1679 static int
1680 __htab_map_lookup_and_delete_batch(struct bpf_map *map,
1681 const union bpf_attr *attr,
1682 union bpf_attr __user *uattr,
1683 bool do_delete, bool is_lru_map,
1684 bool is_percpu)
1685 {
1686 struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1687 void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1688 void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1689 void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1690 void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1691 u32 batch, max_count, size, bucket_size, map_id;
1692 u32 bucket_cnt, total, key_size, value_size;
1693 struct htab_elem *node_to_free = NULL;
1694 u64 elem_map_flags, map_flags;
1695 struct hlist_nulls_head *head;
1696 struct hlist_nulls_node *n;
1697 unsigned long flags = 0;
1698 bool locked = false;
1699 struct htab_elem *l;
1700 struct bucket *b;
1701 int ret = 0;
1702
1703 elem_map_flags = attr->batch.elem_flags;
1704 if ((elem_map_flags & ~BPF_F_LOCK) ||
1705 ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1706 return -EINVAL;
1707
1708 map_flags = attr->batch.flags;
1709 if (map_flags)
1710 return -EINVAL;
1711
1712 max_count = attr->batch.count;
1713 if (!max_count)
1714 return 0;
1715
1716 if (put_user(0, &uattr->batch.count))
1717 return -EFAULT;
1718
1719 batch = 0;
1720 if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1721 return -EFAULT;
1722
1723 if (batch >= htab->n_buckets)
1724 return -ENOENT;
1725
1726 key_size = htab->map.key_size;
1727 value_size = htab->map.value_size;
1728 size = round_up(value_size, 8);
1729 if (is_percpu)
1730 value_size = size * num_possible_cpus();
1731 total = 0;
1732 /* while experimenting with hash tables with sizes ranging from 10 to
1733 * 1000, it was observed that a bucket can have up to 5 entries.
1734 */
1735 bucket_size = 5;
1736
1737 alloc:
1738 /* We cannot do copy_from_user or copy_to_user inside
1739 * the rcu_read_lock. Allocate enough space here.
1740 */
1741 keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1742 values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1743 if (!keys || !values) {
1744 ret = -ENOMEM;
1745 goto after_loop;
1746 }
1747
1748 again:
1749 bpf_disable_instrumentation();
1750 rcu_read_lock();
1751 again_nocopy:
1752 dst_key = keys;
1753 dst_val = values;
1754 b = &htab->buckets[batch];
1755 head = &b->head;
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
	if (locked) {
		ret = htab_lock_bucket(b, &flags);
		if (ret) {
			rcu_read_unlock();
			bpf_enable_instrumentation();
			goto after_loop;
		}
	}

	bucket_cnt = 0;
	hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
		bucket_cnt++;

	if (bucket_cnt && !locked) {
		locked = true;
		goto again_nocopy;
	}

	if (bucket_cnt > (max_count - total)) {
		if (total == 0)
			ret = -ENOSPC;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(b, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		goto after_loop;
	}

	if (bucket_cnt > bucket_size) {
		bucket_size = bucket_cnt;
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(b, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
		goto alloc;
	}

	/* The next block is only safe to run when the bucket lock is held */
	if (!locked)
		goto next_batch;

	hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
		memcpy(dst_key, l->key, key_size);

		if (is_percpu) {
			int off = 0, cpu;
			void __percpu *pptr;

			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
				off += size;
			}
		} else {
			value = htab_elem_value(l, key_size);
			if (is_fd_htab(htab)) {
				struct bpf_map **inner_map = value;

				/* Actual value is the id of the inner map */
				map_id = map->ops->map_fd_sys_lookup_elem(*inner_map);
				value = &map_id;
			}

			if (elem_map_flags & BPF_F_LOCK)
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
		}
		if (do_delete) {
			hlist_nulls_del_rcu(&l->hash_node);

			/* bpf_lru_push_free() will acquire lru_lock, which
			 * may cause deadlock. See comments in function
			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
			 * after releasing the bucket lock.
			 *
			 * For htab of maps, htab_put_fd_value() in
			 * free_htab_elem() may acquire a spinlock with bucket
			 * lock being held and it violates the lock rule, so
			 * invoke free_htab_elem() after unlock as well.
			 */
			l->batch_flink = node_to_free;
			node_to_free = l;
		}
		dst_key += key_size;
		dst_val += value_size;
	}

	htab_unlock_bucket(b, flags);
	locked = false;

	while (node_to_free) {
		l = node_to_free;
		node_to_free = node_to_free->batch_flink;
		if (is_lru_map)
			htab_lru_push_free(htab, l);
		else
			free_htab_elem(htab, l);
	}

next_batch:
	/* If we are not copying data, we can go to the next bucket and avoid
	 * dropping the RCU read lock.
	 */
	if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
		batch++;
		goto again_nocopy;
	}

	rcu_read_unlock();
	bpf_enable_instrumentation();
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
	    key_size * bucket_cnt) ||
	    copy_to_user(uvalues + total * value_size, values,
	    value_size * bucket_cnt))) {
		ret = -EFAULT;
		goto after_loop;
	}

	total += bucket_cnt;
	batch++;
	if (batch >= htab->n_buckets) {
		ret = -ENOENT;
		goto after_loop;
	}
	goto again;

after_loop:
	if (ret == -EFAULT)
		goto out;

	/* copy # of entries and next batch */
	ubatch = u64_to_user_ptr(attr->batch.out_batch);
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
	    put_user(total, &uattr->batch.count))
		ret = -EFAULT;

out:
	kvfree(keys);
	kvfree(values);
	return ret;
}

static int
htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
}

static int
htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					const union bpf_attr *attr,
					union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
}

static int
htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
		      union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
}

static int
htab_map_lookup_and_delete_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
}

static int
htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
				 const union bpf_attr *attr,
				 union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
}

static int
htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
					    const union bpf_attr *attr,
					    union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
}

static int
htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
}

static int
htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
				     const union bpf_attr *attr,
				     union bpf_attr __user *uattr)
{
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
}

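/*
 * The structures below back the bpf_iter interface for hash maps. As an
 * illustration only (not part of the kernel build), a map-element iterator
 * program on the BPF side would typically look roughly like the following,
 * assuming libbpf conventions and a hypothetical 4-byte key / 8-byte value
 * map; the section name and the field accesses mirror
 * struct bpf_iter__bpf_map_elem as used in __bpf_hash_map_seq_show():
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_htab(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		if (!key || !val)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 *
 * For per-CPU hash maps, ctx->value points to the flat buffer prepared in
 * bpf_iter_init_hash_map(), holding one round_up(value_size, 8) sized slot
 * per possible CPU.
 */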
struct bpf_iter_seq_hash_map_info {
	struct bpf_map *map;
	struct bpf_htab *htab;
	void *percpu_value_buf; // non-zero means percpu hash
	u32 bucket_id;
	u32 skip_elems;
};

static struct htab_elem *
bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
			   struct htab_elem *prev_elem)
{
	const struct bpf_htab *htab = info->htab;
	u32 skip_elems = info->skip_elems;
	u32 bucket_id = info->bucket_id;
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	struct bucket *b;
	u32 i, count;

	if (bucket_id >= htab->n_buckets)
		return NULL;

	/* try to find next elem in the same bucket */
	if (prev_elem) {
		/* no update/deletion on this bucket, prev_elem should still be
		 * valid and we won't skip elements.
		 */
		n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
		elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
		if (elem)
			return elem;

		/* not found, unlock and go to the next bucket */
		b = &htab->buckets[bucket_id++];
		rcu_read_unlock();
		skip_elems = 0;
	}

	for (i = bucket_id; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();

		count = 0;
		head = &b->head;
		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
			if (count >= skip_elems) {
				info->bucket_id = i;
				info->skip_elems = count;
				return elem;
			}
			count++;
		}

		rcu_read_unlock();
		skip_elems = 0;
	}

	info->bucket_id = i;
	info->skip_elems = 0;
	return NULL;
}

static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct htab_elem *elem;

	elem = bpf_hash_map_seq_find_next(info, NULL);
	if (!elem)
		return NULL;

	if (*pos == 0)
		++*pos;
	return elem;
}

static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;

	++*pos;
	++info->skip_elems;
	return bpf_hash_map_seq_find_next(info, v);
}

static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
{
	struct bpf_iter_seq_hash_map_info *info = seq->private;
	struct bpf_iter__bpf_map_elem ctx = {};
	struct bpf_map *map = info->map;
	struct bpf_iter_meta meta;
	int ret = 0, off = 0, cpu;
	u32 roundup_value_size;
	struct bpf_prog *prog;
	void __percpu *pptr;

	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, elem == NULL);
	if (prog) {
		ctx.meta = &meta;
		ctx.map = info->map;
		if (elem) {
			ctx.key = elem->key;
			if (!info->percpu_value_buf) {
				ctx.value = htab_elem_value(elem, map->key_size);
			} else {
				roundup_value_size = round_up(map->value_size, 8);
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
					off += roundup_value_size;
				}
				ctx.value = info->percpu_value_buf;
			}
		}
		ret = bpf_iter_run_prog(prog, &ctx);
	}

	return ret;
}

static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
{
	return __bpf_hash_map_seq_show(seq, v);
}

static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
	if (!v)
		(void)__bpf_hash_map_seq_show(seq, NULL);
	else
		rcu_read_unlock();
}

static int bpf_iter_init_hash_map(void *priv_data,
				  struct bpf_iter_aux_info *aux)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
	struct bpf_map *map = aux->map;
	void *value_buf;
	u32 buf_size;

	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		buf_size = round_up(map->value_size, 8) * num_possible_cpus();
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
		if (!value_buf)
			return -ENOMEM;

		seq_info->percpu_value_buf = value_buf;
	}

	bpf_map_inc_with_uref(map);
	seq_info->map = map;
	seq_info->htab = container_of(map, struct bpf_htab, map);
	return 0;
}

static void bpf_iter_fini_hash_map(void *priv_data)
{
	struct bpf_iter_seq_hash_map_info *seq_info = priv_data;

	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
}

static const struct seq_operations bpf_hash_map_seq_ops = {
	.start = bpf_hash_map_seq_start,
	.next = bpf_hash_map_seq_next,
	.stop = bpf_hash_map_seq_stop,
	.show = bpf_hash_map_seq_show,
};

static const struct bpf_iter_seq_info iter_seq_info = {
	.seq_ops = &bpf_hash_map_seq_ops,
	.init_seq_private = bpf_iter_init_hash_map,
	.fini_seq_private = bpf_iter_fini_hash_map,
	.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};

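/*
 * Callback-based iteration used by the bpf_for_each_map_elem() helper. As an
 * illustration only (not part of the kernel build), a BPF program would
 * typically use it roughly as follows, assuming libbpf conventions and a
 * hypothetical map named htab with 4-byte keys and 8-byte values; the
 * callback return convention (0 - continue, non-zero - stop) matches the
 * check in bpf_for_each_hash_elem() below:
 *
 *	static long count_cb(struct bpf_map *map, __u32 *key, __u64 *val,
 *			     void *ctx)
 *	{
 *		(*(__u64 *)ctx)++;
 *		return 0;
 *	}
 *
 *	__u64 cnt = 0;
 *	bpf_for_each_map_elem(&htab, count_cb, &cnt, 0);
 */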
static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
				   void *callback_ctx, u64 flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_head *head;
	struct hlist_nulls_node *n;
	struct htab_elem *elem;
	int i, num_elems = 0;
	void __percpu *pptr;
	struct bucket *b;
	void *key, *val;
	bool is_percpu;
	u64 ret = 0;

	cant_migrate();

	if (flags != 0)
		return -EINVAL;

	is_percpu = htab_is_percpu(htab);

	/* Migration has been disabled, so the percpu value prepared here will
	 * be the same as the one seen by the BPF program via
	 * bpf_map_lookup_elem().
	 */
	for (i = 0; i < htab->n_buckets; i++) {
		b = &htab->buckets[i];
		rcu_read_lock();
		head = &b->head;
		hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
			key = elem->key;
			if (is_percpu) {
				/* current cpu value for percpu map */
				pptr = htab_elem_get_ptr(elem, map->key_size);
				val = this_cpu_ptr(pptr);
			} else {
				val = htab_elem_value(elem, map->key_size);
			}
			num_elems++;
			ret = callback_fn((u64)(long)map, (u64)(long)key,
					  (u64)(long)val, (u64)(long)callback_ctx, 0);
			/* return value: 0 - continue, 1 - stop and return */
			if (ret) {
				rcu_read_unlock();
				goto out;
			}
		}
		rcu_read_unlock();
	}
out:
	return num_elems;
}

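/*
 * A rough sketch of the accounting done in htab_map_mem_usage() below for a
 * preallocated, non-per-CPU, non-LRU hash map (illustrative only; the exact
 * sizes are architecture and config dependent):
 *
 *	usage = sizeof(struct bpf_htab)
 *	      + sizeof(struct bucket) * n_buckets
 *	      + sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT
 *	      + elem_size * (max_entries + possible per-CPU extra elems)
 *	      + sizeof(struct htab_elem *) * num_possible_cpus()
 */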
static u64 htab_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	u32 value_size = round_up(htab->map.value_size, 8);
	bool prealloc = htab_is_prealloc(htab);
	bool percpu = htab_is_percpu(htab);
	bool lru = htab_is_lru(htab);
	u64 num_entries;
	u64 usage = sizeof(struct bpf_htab);

	usage += sizeof(struct bucket) * htab->n_buckets;
	usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
	if (prealloc) {
		num_entries = map->max_entries;
		if (htab_has_extra_elems(htab))
			num_entries += num_possible_cpus();

		usage += htab->elem_size * num_entries;

		if (percpu)
			usage += value_size * num_possible_cpus() * num_entries;
		else if (!lru)
			usage += sizeof(struct htab_elem *) * num_possible_cpus();
	} else {
#define LLIST_NODE_SZ sizeof(struct llist_node)

		num_entries = htab->use_percpu_counter ?
				      percpu_counter_sum(&htab->pcount) :
				      atomic_read(&htab->count);
		usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
		if (percpu) {
			usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
			usage += value_size * num_possible_cpus() * num_entries;
		}
	}
	return usage;
}

BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_internal_structs,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_gen_lookup = htab_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_release_uref = htab_map_free_internal_structs,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_gen_lookup = htab_lru_map_gen_lookup,
	.map_seq_show_elem = htab_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
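/*
 * The instruction sequence emitted below is roughly equivalent to the
 * following C (illustrative only), with BPF_REG_0 carrying the result and the
 * final BPF_MOV64_PERCPU_REG turning the per-CPU pointer into this CPU's
 * address:
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	if (l == NULL)
 *		return NULL;
 *	pptr = *(void **)((void *)l + offsetof(struct htab_elem, key) +
 *			  roundup(map->key_size, 8));
 *	return this_cpu_ptr(pptr);
 */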
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
				  (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
				offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);

	return insn - insn_buf;
}

static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l)
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}

static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

	if (cpu >= nr_cpu_ids)
		return NULL;

	l = __htab_map_lookup_elem(map, key);
	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu);
	}

	return NULL;
}

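/*
 * Syscall-side helpers for per-CPU hash maps. When looking up such an element
 * from user space, the value buffer must provide one round_up(value_size, 8)
 * sized slot per possible CPU, as filled in by bpf_percpu_hash_copy() below.
 * A minimal sketch (illustrative only, assuming libbpf and a hypothetical
 * 8-byte value type):
 *
 *	int ncpus = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpus, sizeof(__u64));
 *	int i;
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, vals))
 *		for (i = 0; i < ncpus; i++)
 *			printf("cpu%d: %llu\n", i, (unsigned long long)vals[i]);
 */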
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	/* We do not mark LRU map element here in order to not mess up
	 * eviction heuristics when user space does a map walk.
	 */
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = htab_map_update_elem_in_place(map, key, value, map_flags,
						    true, true);
	rcu_read_unlock();

	return ret;
}

static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
					  struct seq_file *m)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int cpu;

	rcu_read_lock();

	l = __htab_map_lookup_elem(map, key);
	if (!l) {
		rcu_read_unlock();
		return;
	}

	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_putc(m, '\n');
	}
	seq_puts(m, "}\n");

	rcu_read_unlock();
}

const struct bpf_map_ops htab_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_gen_lookup = htab_percpu_map_gen_lookup,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

const struct bpf_map_ops htab_lru_percpu_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc_check = htab_map_alloc_check,
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
	.map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem,
	.map_seq_show_elem = htab_percpu_map_seq_show_elem,
	.map_set_for_each_callback_args = map_set_for_each_callback_args,
	.map_for_each_callback = bpf_for_each_hash_elem,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab_lru_percpu),
	.map_btf_id = &htab_map_btf_ids[0],
	.iter_seq_info = &iter_seq_info,
};

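/*
 * fd-based hash maps store a file descriptor on update and report an id on
 * lookup; BPF_MAP_TYPE_HASH_OF_MAPS is the main user. As an illustration only
 * (not part of the kernel build, assuming libbpf and a hypothetical inner_fd
 * template map), an outer hash-of-maps can be created and populated roughly
 * like this:
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts, .inner_map_fd = inner_fd);
 *	int outer_fd = bpf_map_create(BPF_MAP_TYPE_HASH_OF_MAPS, "outer",
 *				      sizeof(__u32), sizeof(__u32), 16, &opts);
 *
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *
 * Lookups from user space then return the inner map id rather than an fd, as
 * implemented by bpf_fd_htab_map_lookup_elem() below.
 */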
static int fd_htab_map_alloc_check(union bpf_attr *attr)
{
	if (attr->value_size != sizeof(u32))
		return -EINVAL;
	return htab_map_alloc_check(attr);
}

static void fd_htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_nulls_node *n;
	struct hlist_nulls_head *head;
	struct htab_elem *l;
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
			void *ptr = fd_htab_map_get_ptr(map, l);

			map->ops->map_fd_put_ptr(map, ptr, false);
		}
	}

	htab_map_free(map);
}

/* only called from syscall */
int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
{
	void **ptr;
	int ret = 0;

	if (!map->ops->map_fd_sys_lookup_elem)
		return -ENOTSUPP;

	rcu_read_lock();
	ptr = htab_map_lookup_elem(map, key);
	if (ptr)
		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
	else
		ret = -ENOENT;
	rcu_read_unlock();

	return ret;
}

/* Only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
				void *key, void *value, u64 map_flags)
{
	void *ptr;
	int ret;

	ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* The htab bucket lock is always held during update operations in fd
	 * htab map, and the following rcu_read_lock() is only used to avoid
	 * the WARN_ON_ONCE in htab_map_update_elem_in_place().
	 */
	rcu_read_lock();
	ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
	rcu_read_unlock();
	if (ret)
		map->ops->map_fd_put_ptr(map, ptr, false);

	return ret;
}

static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
{
	struct bpf_map *map, *inner_map_meta;

	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
		return map;
	}

	map->inner_map_meta = inner_map_meta;

	return map;
}

static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_map **inner_map = htab_map_lookup_elem(map, key);

	if (!inner_map)
		return NULL;

	return READ_ONCE(*inner_map);
}

static int htab_of_map_gen_lookup(struct bpf_map *map,
				  struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;
	const int ret = BPF_REG_0;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
				  (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
				offsetof(struct htab_elem, key) +
				round_up(map->key_size, 8));
	*insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);

	return insn - insn_buf;
}

static void htab_of_map_free(struct bpf_map *map)
{
	bpf_map_meta_free(map->inner_map_meta);
	fd_htab_map_free(map);
}

const struct bpf_map_ops htab_of_maps_map_ops = {
	.map_alloc_check = fd_htab_map_alloc_check,
	.map_alloc = htab_of_map_alloc,
	.map_free = htab_of_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_of_map_lookup_elem,
	.map_delete_elem = htab_map_delete_elem,
	.map_fd_get_ptr = bpf_map_fd_get_ptr,
	.map_fd_put_ptr = bpf_map_fd_put_ptr,
	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
	.map_gen_lookup = htab_of_map_gen_lookup,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = htab_map_mem_usage,
	BATCH_OPS(htab),
	.map_btf_id = &htab_map_btf_ids[0],
};