1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Linux INET6 implementation
4 * Forwarding Information Database
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Changes:
10 * Yuji SEKIYA @USAGI: Support default route on router node;
11 * remove ip6_null_entry from the top of
12 * routing table.
13 * Ville Nuorvala: Fixed routing subtrees.
14 */
15
16 #define pr_fmt(fmt) "IPv6: " fmt
17
18 #include <linux/bpf.h>
19 #include <linux/errno.h>
20 #include <linux/types.h>
21 #include <linux/net.h>
22 #include <linux/route.h>
23 #include <linux/netdevice.h>
24 #include <linux/in6.h>
25 #include <linux/init.h>
26 #include <linux/list.h>
27 #include <linux/slab.h>
28
29 #include <net/ip.h>
30 #include <net/ipv6.h>
31 #include <net/ndisc.h>
32 #include <net/addrconf.h>
33 #include <net/lwtunnel.h>
34 #include <net/fib_notifier.h>
35
36 #include <net/ip_fib.h>
37 #include <net/ip6_fib.h>
38 #include <net/ip6_route.h>
39
40 static struct kmem_cache *fib6_node_kmem __read_mostly;
41
/*
 * State for one cleaning sweep over a FIB tree: embeds the generic
 * walker and carries the per-route callback plus its argument.
 */
struct fib6_cleaner {
	struct fib6_walker w;	/* embedded walker; drives the traversal */
	struct net *net;	/* namespace whose tables are being cleaned */
	int (*func)(struct fib6_info *, void *arg); /* applied to each route */
	int sernum;		/* serial number to stamp on visited nodes */
	void *arg;		/* opaque cookie handed to @func */
	bool skip_notify;	/* NOTE(review): presumably suppresses fib
				 * notifier events during the sweep - confirm
				 * against fib6_clean_tree() (not in view) */
};
50
/* Starting state for a tree walk; the FWS_S (subtree) state is only
 * meaningful when CONFIG_IPV6_SUBTREES is enabled.
 */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif
56
57 static struct fib6_info *fib6_find_prefix(struct net *net,
58 struct fib6_table *table,
59 struct fib6_node *fn);
60 static struct fib6_node *fib6_repair_tree(struct net *net,
61 struct fib6_table *table,
62 struct fib6_node *fn);
63 static int fib6_walk(struct net *net, struct fib6_walker *w);
64 static int fib6_walk_continue(struct fib6_walker *w);
65
66 /*
67 * A routing update causes an increase of the serial number on the
68 * affected subtree. This allows for cached routes to be asynchronously
69 * tested when modifications are made to the destination cache as a
70 * result of redirects, path MTU changes, etc.
71 */
72
73 static void fib6_gc_timer_cb(struct timer_list *t);
74
/* Iterate over all walkers registered in @net. The list is mutated
 * under net->ipv6.fib6_walker_lock (see fib6_walker_link/unlink).
 */
#define FOR_WALKERS(net, w) \
	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
77
/* Publish walker @w on the namespace's walker list under the BH-safe
 * write lock, making it visible via FOR_WALKERS().
 */
static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
	write_lock_bh(&net->ipv6.fib6_walker_lock);
	list_add(&w->lh, &net->ipv6.fib6_walkers);
	write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
84
/* Remove walker @w from the namespace's walker list; counterpart of
 * fib6_walker_link().
 */
static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
	write_lock_bh(&net->ipv6.fib6_walker_lock);
	list_del(&w->lh);
	write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
91
fib6_new_sernum(struct net * net)92 static int fib6_new_sernum(struct net *net)
93 {
94 int new, old = atomic_read(&net->ipv6.fib6_sernum);
95
96 do {
97 new = old < INT_MAX ? old + 1 : 1;
98 } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));
99
100 return new;
101 }
102
/* Sentinel serial number: fib6_new_sernum() never returns 0, so 0
 * means "no serial number change requested".
 */
enum {
	FIB6_NO_SERNUM_CHANGE = 0,
};
106
/* Stamp the tree node holding @f6i with a fresh serial number so
 * cached routes derived from it get revalidated (see the comment at
 * the top of this file). Caller must hold the table lock, as asserted
 * by rcu_dereference_protected().
 */
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
	struct fib6_node *fn;

	fn = rcu_dereference_protected(f6i->fib6_node,
			lockdep_is_held(&f6i->fib6_table->tb6_lock));
	if (fn)
		WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}
116
117 /*
118 * Auxiliary address test functions for the radix tree.
119 *
120 * These assume a 32bit processor (although it will work on
121 * 64bit processors)
122 */
123
124 /*
125 * test bit
126 */
/* XOR fixup converting a big-endian bit index into the host bit order
 * used by the shift trick in addr_bit_set() below; zero on big-endian
 * hosts where no conversion is needed.
 */
#if defined(__LITTLE_ENDIAN)
# define BITOP_BE32_SWIZZLE	(0x1F & ~7)
#else
# define BITOP_BE32_SWIZZLE	0
#endif
132
/*
 * Test bit @fn_bit of the network-byte-order address @token. Returns
 * the (unnormalized) mask bit when set, 0 otherwise — callers only
 * test for zero/non-zero.
 */
static __be32 addr_bit_set(const void *token, int fn_bit)
{
	const __be32 *addr = token;
	/*
	 * Here,
	 *	1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
	 * is optimized version of
	 *	htonl(1 << ((~fn_bit)&0x1F))
	 * See include/asm-generic/bitops/le.h.
	 */
	return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
	       addr[fn_bit >> 5];
}
146
/*
 * Allocate a zeroed fib6_info, optionally with space for one embedded
 * fib6_nh appended. The returned entry carries a single reference
 * owned by the caller; returns NULL on allocation failure.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
	size_t bytes = sizeof(struct fib6_info);
	struct fib6_info *f6i;

	if (with_fib6_nh)
		bytes += sizeof(struct fib6_nh);

	f6i = kzalloc(bytes, gfp_flags);
	if (!f6i)
		return NULL;

	/* fib6_siblings is a union with nh_list, so this initializes both */
	INIT_LIST_HEAD(&f6i->fib6_siblings);
	INIT_HLIST_NODE(&f6i->gc_link);
	refcount_set(&f6i->fib6_ref, 1);

	return f6i;
}
167
/* RCU callback performing the final teardown of a fib6_info: release
 * the nexthop (shared nexthop object or embedded fib6_nh), drop the
 * metrics block reference and free the entry. By this point the entry
 * must already be unlinked from its tree, hence the WARN_ON.
 */
void fib6_info_destroy_rcu(struct rcu_head *head)
{
	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);

	WARN_ON(f6i->fib6_node);

	if (f6i->nh)
		nexthop_put(f6i->nh);
	else
		fib6_nh_release(f6i->fib6_nh);

	ip_fib_metrics_put(f6i->fib6_metrics);
	kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
183
/* Allocate a zeroed tree node from the dedicated slab cache and
 * account it in the per-namespace node counter. GFP_ATOMIC because
 * callers such as fib6_add_1() run under the table spinlock.
 */
static struct fib6_node *node_alloc(struct net *net)
{
	struct fib6_node *fn;

	fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
	if (fn)
		net->ipv6.rt6_stats->fib_nodes++;

	return fn;
}
194
/* Free a node that was never published in the tree — no RCU grace
 * period needed since no reader can hold a reference to it.
 */
static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
	kmem_cache_free(fib6_node_kmem, fn);
	net->ipv6.rt6_stats->fib_nodes--;
}
200
/* Unlink-time free: defer the actual free to an RCU grace period so
 * concurrent lockless readers may still be traversing the node.
 */
static void node_free(struct net *net, struct fib6_node *fn)
{
	kfree_rcu(fn, rcu);
	net->ipv6.rt6_stats->fib_nodes--;
}
206
/* Release a table: drop its inetpeer cache, then free the table itself. */
static void fib6_free_table(struct fib6_table *table)
{
	inetpeer_invalidate_tree(&table->tb6_peers);
	kfree(table);
}
212
/* Insert @tb into the namespace's hash of FIB tables. Also the single
 * place the table lock is initialized (see comment below); once linked
 * a table is never removed from the hash.
 */
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
	unsigned int h;

	/*
	 * Initialize table lock at a single place to give lockdep a key,
	 * tables aren't visible prior to being linked to the list.
	 */
	spin_lock_init(&tb->tb6_lock);
	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);

	/*
	 * No protection necessary, this is the only list mutatation
	 * operation, tables never disappear once they exist.
	 */
	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
}
230
231 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
232
/* Allocate and initialize a new FIB table with the given @id. The root
 * node's leaf initially points at the namespace null entry; returns
 * NULL on allocation failure.
 */
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
	struct fib6_table *table;

	table = kzalloc_obj(*table, GFP_ATOMIC);
	if (table) {
		table->tb6_id = id;
		rcu_assign_pointer(table->tb6_root.leaf,
				   net->ipv6.fib6_null_entry);
		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
		inet_peer_base_init(&table->tb6_peers);
		INIT_HLIST_HEAD(&table->tb6_gc_hlist);
	}

	return table;
}
249
/* Return the table with id @id (0 maps to the main table), creating
 * and linking it first if it does not exist. The lookup is repeated
 * under fib_table_hash_lock to resolve races between concurrent
 * creators; the loser frees its unlinked copy. Returns NULL only on
 * allocation failure.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
	struct fib6_table *tb, *new_tb;

	if (id == 0)
		id = RT6_TABLE_MAIN;

	/* fast path: table already exists */
	tb = fib6_get_table(net, id);
	if (tb)
		return tb;

	new_tb = fib6_alloc_table(net, id);
	if (!new_tb)
		return NULL;

	spin_lock_bh(&net->ipv6.fib_table_hash_lock);

	/* re-check under the lock: someone may have linked it meanwhile */
	tb = fib6_get_table(net, id);
	if (unlikely(tb)) {
		spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
		kfree(new_tb);
		return tb;
	}

	fib6_link_table(net, new_tb);

	spin_unlock_bh(&net->ipv6.fib_table_hash_lock);

	return new_tb;
}
EXPORT_SYMBOL_GPL(fib6_new_table);
281
/* Look up a table by @id (0 means the main table) in the namespace
 * hash. Tables are never unlinked once created (see fib6_link_table()),
 * so no reference is taken on the result.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
	struct hlist_head *head;
	struct fib6_table *tb;

	if (!id)
		id = RT6_TABLE_MAIN;

	head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];

	/* See comment in fib6_link_table(). RCU is not required,
	 * but rcu_dereference_raw() is used to avoid data-race.
	 */
	hlist_for_each_entry_rcu(tb, head, tb6_hlist, true)
		if (tb->tb6_id == id)
			return tb;

	return NULL;
}
EXPORT_SYMBOL_GPL(fib6_get_table);
302
/* Multi-table build: link the two predefined tables (main and local)
 * at namespace init.
 */
static void __net_init fib6_tables_init(struct net *net)
{
	fib6_link_table(net, net->ipv6.fib6_main_tbl);
	fib6_link_table(net, net->ipv6.fib6_local_tbl);
}
308 #else
309
/* Single-table build: there is only the main table, so "new" is just a
 * lookup.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
	return fib6_get_table(net, id);
}
314
/* Single-table build: every table id maps to the main table. */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
	return net->ipv6.fib6_main_tbl;
}
319
/* Single-table stand-in for the FIB-rules lookup: consult the main
 * table directly. A result carrying -EAGAIN is replaced by the null
 * entry, taking a dst reference unless RT6_LOOKUP_F_DST_NOREF.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb,
				   int flags, pol_lookup_t lookup)
{
	struct rt6_info *rt;

	rt = pol_lookup_func(lookup,
			net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
	if (rt->dst.error == -EAGAIN) {
		ip6_rt_put_flags(rt, flags);
		rt = net->ipv6.ip6_null_entry;
		if (!(flags & RT6_LOOKUP_F_DST_NOREF))
			dst_hold(&rt->dst);
	}

	return &rt->dst;
}
337
/* called with rcu lock held; no reference taken on fib6_info */
/* Single-table build: delegate straight to the main table. */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
		struct fib6_result *res, int flags)
{
	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
				 res, flags);
}
345
/* Single-table build: only the main table exists to be linked. */
static void __net_init fib6_tables_init(struct net *net)
{
	fib6_link_table(net, net->ipv6.fib6_main_tbl);
}
350
351 #endif
352
/* Aggregate the fib_seq change counters of all tables in @net; read
 * under RCU with READ_ONCE pairing against the WRITE_ONCE updates in
 * the call_fib6_*_notifiers() helpers below.
 */
unsigned int fib6_tables_seq_read(const struct net *net)
{
	unsigned int h, fib_seq = 0;

	rcu_read_lock();
	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
		const struct fib6_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
			fib_seq += READ_ONCE(tb->fib_seq);
	}
	rcu_read_unlock();

	return fib_seq;
}
369
/* Deliver a single-route event to one specific notifier block @nb
 * (used when replaying the FIB to a new listener).
 */
static int call_fib6_entry_notifier(struct notifier_block *nb,
				    enum fib_event_type event_type,
				    struct fib6_info *rt,
				    struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
	};

	return call_fib6_notifier(nb, event_type, &info.info);
}
382
/* As call_fib6_entry_notifier(), but also conveys the sibling count of
 * a multipath route.
 */
static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
					      enum fib_event_type event_type,
					      struct fib6_info *rt,
					      unsigned int nsiblings,
					      struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
		.nsiblings = nsiblings,
	};

	return call_fib6_notifier(nb, event_type, &info.info);
}
397
/* Broadcast a single-route event to all registered FIB notifiers and
 * bump the owning table's change counter (WRITE_ONCE pairs with the
 * lockless READ_ONCE in fib6_tables_seq_read()).
 */
int call_fib6_entry_notifiers(struct net *net,
			      enum fib_event_type event_type,
			      struct fib6_info *rt,
			      struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, event_type, &info.info);
}
411
/* Multipath variant of call_fib6_entry_notifiers(): includes the
 * sibling count and bumps the table change counter likewise.
 */
int call_fib6_multipath_entry_notifiers(struct net *net,
					enum fib_event_type event_type,
					struct fib6_info *rt,
					unsigned int nsiblings,
					struct netlink_ext_ack *extack)
{
	struct fib6_entry_notifier_info info = {
		.info.extack = extack,
		.rt = rt,
		.nsiblings = nsiblings,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, event_type, &info.info);
}
427
/* Broadcast a FIB_EVENT_ENTRY_REPLACE for @rt, carrying its current
 * sibling count; bumps the table change counter like the other
 * notifier helpers.
 */
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
{
	struct fib6_entry_notifier_info info = {
		.rt = rt,
		.nsiblings = rt->fib6_nsiblings,
	};

	WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
	return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
}
438
/* Context threaded through the walker to fib6_rt_dump() when replaying
 * the whole FIB to one notifier block.
 */
struct fib6_dump_arg {
	struct net *net;		/* namespace being dumped */
	struct notifier_block *nb;	/* listener receiving the replay */
	struct netlink_ext_ack *extack;	/* error reporting back to caller */
};
444
fib6_rt_dump(struct fib6_info * rt,struct fib6_dump_arg * arg)445 static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
446 {
447 enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
448 unsigned int nsiblings;
449 int err;
450
451 if (!rt || rt == arg->net->ipv6.fib6_null_entry)
452 return 0;
453
454 nsiblings = READ_ONCE(rt->fib6_nsiblings);
455 if (nsiblings)
456 err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
457 rt,
458 nsiblings,
459 arg->extack);
460 else
461 err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
462 arg->extack);
463
464 return err;
465 }
466
/* Walker callback for the notifier replay: emit the current leaf and
 * clear it so fib6_walk_continue() advances to the next node.
 */
static int fib6_node_dump(struct fib6_walker *w)
{
	int err;

	err = fib6_rt_dump(w->leaf, w->args);
	w->leaf = NULL;
	return err;
}
475
/* Walk one table from its root under the table lock, invoking w->func
 * for each node; returns the walker's result.
 */
static int fib6_table_dump(struct net *net, struct fib6_table *tb,
			   struct fib6_walker *w)
{
	int err;

	w->root = &tb->tb6_root;
	spin_lock_bh(&tb->tb6_lock);
	err = fib6_walk(net, w);
	spin_unlock_bh(&tb->tb6_lock);
	return err;
}
487
/* Called with rcu_read_lock() */
/* Replay every route in every table of @net to the newly registered
 * notifier @nb using a temporary walker; stops at the first error.
 */
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct fib6_dump_arg arg;
	struct fib6_walker *w;
	unsigned int h;
	int err = 0;

	w = kzalloc_obj(*w, GFP_ATOMIC);
	if (!w)
		return -ENOMEM;

	w->func = fib6_node_dump;
	arg.net = net;
	arg.nb = nb;
	arg.extack = extack;
	w->args = &arg;

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
		struct fib6_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
			err = fib6_table_dump(net, tb, w);
			if (err)
				goto out;
		}
	}

out:
	kfree(w);

	/* The tree traversal function should never return a positive value. */
	return err > 0 ? -EINVAL : err;
}
524
/* Walker callback for an RTM_GETROUTE netlink dump: emit the routes of
 * the current node until the skb fills up. Returns 1 to suspend the
 * walk (resume state kept in w->leaf / w->skip_in_node), 0 once the
 * node is fully dumped.
 */
static int fib6_dump_node(struct fib6_walker *w)
{
	int res;
	struct fib6_info *rt;

	for_each_fib6_walker_rt(w) {
		res = rt6_dump_route(rt, w->args, w->skip_in_node);
		if (res >= 0) {
			/* Frame is full, suspend walking */
			w->leaf = rt;

			/* We'll restart from this node, so if some routes were
			 * already dumped, skip them next time.
			 */
			w->skip_in_node += res;

			return 1;
		}
		w->skip_in_node = 0;

		/* Multipath routes are dumped in one route with the
		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
		 * last sibling of this route (no need to dump the
		 * sibling routes again)
		 */
		if (rt->fib6_nsiblings)
			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info,
					     fib6_siblings);
	}
	w->leaf = NULL;
	return 0;
}
558
/* Tear down the dump state stashed in the netlink callback by
 * inet6_dump_fib(): args[2] holds the walker, args[3] the original
 * cb->done, args[4] the walk-in-progress flag. args[1] is set to 3 as
 * an end marker (NOTE(review): value reused from the hash-bucket
 * cursor slot - confirm its consumers before changing).
 */
static void fib6_dump_end(struct netlink_callback *cb)
{
	struct net *net = sock_net(cb->skb->sk);
	struct fib6_walker *w = (void *)cb->args[2];

	if (w) {
		if (cb->args[4]) {
			cb->args[4] = 0;
			fib6_walker_unlink(net, w);
		}
		cb->args[2] = 0;
		kfree(w);
	}
	cb->done = (void *)cb->args[3];
	cb->args[1] = 3;
}
575
/* Netlink dump destructor: release our state, then chain to the
 * original cb->done saved by inet6_dump_fib().
 */
static int fib6_dump_done(struct netlink_callback *cb)
{
	fib6_dump_end(cb);
	return cb->done ? cb->done(cb) : 0;
}
581
/* Dump one table into the netlink skb, suspending/resuming across
 * callback invocations. cb->args[4] flags a suspended walk; cb->args[5]
 * caches the root sernum so a concurrent tree change restarts the walk
 * from the root while skipping the w->count entries already dumped.
 * Returns >0 when suspended (skb full), <=0 when the table is done.
 */
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
			   struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct fib6_walker *w;
	int res;

	w = (void *)cb->args[2];
	w->root = &table->tb6_root;

	if (cb->args[4] == 0) {
		/* fresh walk of this table */
		w->count = 0;
		w->skip = 0;
		w->skip_in_node = 0;

		spin_lock_bh(&table->tb6_lock);
		res = fib6_walk(net, w);
		spin_unlock_bh(&table->tb6_lock);
		if (res > 0) {
			cb->args[4] = 1;
			cb->args[5] = READ_ONCE(w->root->fn_sernum);
		}
	} else {
		int sernum = READ_ONCE(w->root->fn_sernum);
		if (cb->args[5] != sernum) {
			/* Begin at the root if the tree changed */
			cb->args[5] = sernum;
			w->state = FWS_INIT;
			w->node = w->root;
			w->skip = w->count;
			w->skip_in_node = 0;
		} else
			w->skip = 0;

		spin_lock_bh(&table->tb6_lock);
		res = fib6_walk_continue(w);
		spin_unlock_bh(&table->tb6_lock);
		if (res <= 0) {
			fib6_walker_unlink(net, w);
			cb->args[4] = 0;
		}
	}

	return res;
}
627
/* Netlink dump handler for IPv6 routes: iterates all tables (or just
 * the table selected by the request's filter), resuming from cursors
 * kept in cb->args: [0]/[1] hash bucket and intra-chain index (or a
 * "done" flag in the single-table case), [2] walker pointer, [3] the
 * caller's original cb->done, [4]/[5] per-table walk state (see
 * fib6_dump_table()).
 */
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rt6_rtnl_dump_arg arg = {
		.filter.dump_exceptions = true,
		.filter.dump_routes = true,
		.filter.rtnl_held = false,
	};
	const struct nlmsghdr *nlh = cb->nlh;
	struct net *net = sock_net(skb->sk);
	unsigned int e = 0, s_e;
	struct hlist_head *head;
	struct fib6_walker *w;
	struct fib6_table *tb;
	unsigned int h, s_h;
	int err = 0;

	rcu_read_lock();
	if (cb->strict_check) {
		err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
		if (err < 0)
			goto unlock;
	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
		/* legacy (non-strict) request: honour only RTM_F_PREFIX */
		struct rtmsg *rtm = nlmsg_data(nlh);

		if (rtm->rtm_flags & RTM_F_PREFIX)
			arg.filter.flags = RTM_F_PREFIX;
	}

	w = (void *)cb->args[2];
	if (!w) {
		/* New dump:
		 *
		 * 1. allocate and initialize walker.
		 */
		w = kzalloc_obj(*w, GFP_ATOMIC);
		if (!w) {
			err = -ENOMEM;
			goto unlock;
		}
		w->func = fib6_dump_node;
		cb->args[2] = (long)w;

		/* 2. hook callback destructor.
		 */
		cb->args[3] = (long)cb->done;
		cb->done = fib6_dump_done;

	}

	arg.skb = skb;
	arg.cb = cb;
	arg.net = net;
	w->args = &arg;

	if (arg.filter.table_id) {
		/* single-table dump requested by the filter */
		tb = fib6_get_table(net, arg.filter.table_id);
		if (!tb) {
			if (rtnl_msg_family(cb->nlh) != PF_INET6)
				goto unlock;

			NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
			err = -ENOENT;
			goto unlock;
		}

		if (!cb->args[0]) {
			err = fib6_dump_table(tb, skb, cb);
			if (!err)
				cb->args[0] = 1;
		}
		goto unlock;
	}

	s_h = cb->args[0];
	s_e = cb->args[1];

	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
			if (e < s_e)
				goto next;
			err = fib6_dump_table(tb, skb, cb);
			if (err != 0)
				goto out;
next:
			e++;
		}
	}
out:
	cb->args[1] = e;
	cb->args[0] = h;

unlock:
	rcu_read_unlock();
	if (err <= 0)
		fib6_dump_end(cb);
	return err;
}
727
/* Set one metric on @f6i. If the entry still shares the immutable
 * default metrics block, atomically install a private zeroed copy
 * first via cmpxchg; on success the metric is already set and we
 * return, while the cmpxchg loser frees its copy and falls through to
 * write into whichever block won.
 */
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
{
	struct dst_metrics *m;

	if (!f6i)
		return;

	if (READ_ONCE(f6i->fib6_metrics) == &dst_default_metrics) {
		struct dst_metrics *dflt = (struct dst_metrics *)&dst_default_metrics;
		struct dst_metrics *p = kzalloc_obj(*p, GFP_ATOMIC);

		if (!p)
			return;

		p->metrics[metric - 1] = val;
		refcount_set(&p->refcnt, 1);
		if (cmpxchg(&f6i->fib6_metrics, dflt, p) != dflt)
			kfree(p);
		else
			return;
	}

	m = READ_ONCE(f6i->fib6_metrics);
	WRITE_ONCE(m->metrics[metric - 1], val);
}
753
754 /*
755 * Routing Table
756 *
757 * return the appropriate node for a routing tree "add" operation
758 * by either creating and inserting or by returning an existing
759 * node.
760 */
761
/*
 * Find or create the fib6_node for prefix @addr/@plen under @root for
 * a route "add". @offset is the byte offset of the rt6key within
 * fib6_info used for comparison (destination or, with subtrees, source
 * key). Returns the node on success, or ERR_PTR(-ENOENT) when a
 * required match is missing / ERR_PTR(-ENOMEM) on allocation failure.
 * Caller holds table->tb6_lock (asserted by the lockdep conditions).
 */
static struct fib6_node *fib6_add_1(struct net *net,
				    struct fib6_table *table,
				    struct fib6_node *root,
				    struct in6_addr *addr, int plen,
				    int offset, int allow_create,
				    int replace_required,
				    struct netlink_ext_ack *extack)
{
	struct fib6_node *fn, *in, *ln;
	struct fib6_node *pn = NULL;
	struct rt6key *key;
	int bit;
	__be32 dir = 0;

	/* insert node in tree */

	fn = root;

	do {
		struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
					    lockdep_is_held(&table->tb6_lock));
		key = (struct rt6key *)((u8 *)leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
			if (!allow_create) {
				if (replace_required) {
					NL_SET_ERR_MSG(extack,
						       "Can not replace route - no match found");
					pr_warn("Can't replace route, no match found\n");
					return ERR_PTR(-ENOENT);
				}
				pr_warn("NLM_F_CREATE should be set when creating new route\n");
			}
			goto insert_above;
		}

		/*
		 *	Exact match ?
		 */

		if (plen == fn->fn_bit) {
			/* clean up an intermediate node */
			if (!(fn->fn_flags & RTN_RTINFO)) {
				RCU_INIT_POINTER(fn->leaf, NULL);
				fib6_info_release(leaf);
			/* remove null_entry in the root node */
			} else if (fn->fn_flags & RTN_TL_ROOT &&
				   rcu_access_pointer(fn->leaf) ==
				   net->ipv6.fib6_null_entry) {
				RCU_INIT_POINTER(fn->leaf, NULL);
			}

			return fn;
		}

		/*
		 *	We have more bits to go
		 */

		/* Try to walk down on tree. */
		dir = addr_bit_set(addr, fn->fn_bit);
		pn = fn;
		fn = dir ?
		     rcu_dereference_protected(fn->right,
					lockdep_is_held(&table->tb6_lock)) :
		     rcu_dereference_protected(fn->left,
					lockdep_is_held(&table->tb6_lock));
	} while (fn);

	if (!allow_create) {
		/* We should not create new node because
		 * NLM_F_REPLACE was specified without NLM_F_CREATE
		 * I assume it is safe to require NLM_F_CREATE when
		 * REPLACE flag is used! Later we may want to remove the
		 * check for replace_required, because according
		 * to netlink specification, NLM_F_CREATE
		 * MUST be specified if new route is created.
		 * That would keep IPv6 consistent with IPv4
		 */
		if (replace_required) {
			NL_SET_ERR_MSG(extack,
				       "Can not replace route - no match found");
			pr_warn("Can't replace route, no match found\n");
			return ERR_PTR(-ENOENT);
		}
		pr_warn("NLM_F_CREATE should be set when creating new route\n");
	}
	/*
	 *	We walked to the bottom of tree.
	 *	Create new leaf node without children.
	 */

	ln = node_alloc(net);

	if (!ln)
		return ERR_PTR(-ENOMEM);
	ln->fn_bit = plen;
	RCU_INIT_POINTER(ln->parent, pn);

	if (dir)
		rcu_assign_pointer(pn->right, ln);
	else
		rcu_assign_pointer(pn->left, ln);

	return ln;


insert_above:
	/*
	 * split since we don't have a common prefix anymore or
	 * we have a less significant route.
	 * we've to insert an intermediate node on the list
	 * this new node will point to the one we need to create
	 * and the current
	 */

	pn = rcu_dereference_protected(fn->parent,
				       lockdep_is_held(&table->tb6_lock));

	/* find 1st bit in difference between the 2 addrs.

	   See comment in __ipv6_addr_diff: bit may be an invalid value,
	   but if it is >= plen, the value is ignored in any case.
	 */

	bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));

	/*
	 *		(intermediate)[in]
	 *	          /	   \
	 *	(new leaf node)[ln] (old node)[fn]
	 */
	if (plen > bit) {
		in = node_alloc(net);
		ln = node_alloc(net);

		if (!in || !ln) {
			if (in)
				node_free_immediate(net, in);
			if (ln)
				node_free_immediate(net, ln);
			return ERR_PTR(-ENOMEM);
		}

		/*
		 * new intermediate node.
		 * RTN_RTINFO will
		 * be off since that an address that chooses one of
		 * the branches would not match less specific routes
		 * in the other branch
		 */

		in->fn_bit = bit;

		RCU_INIT_POINTER(in->parent, pn);
		in->leaf = fn->leaf;
		/* the intermediate node shares the old node's leaf as a
		 * dummy holder; take an extra reference for it
		 */
		fib6_info_hold(rcu_dereference_protected(in->leaf,
				lockdep_is_held(&table->tb6_lock)));

		/* update parent pointer */
		if (dir)
			rcu_assign_pointer(pn->right, in);
		else
			rcu_assign_pointer(pn->left, in);

		ln->fn_bit = plen;

		RCU_INIT_POINTER(ln->parent, in);
		rcu_assign_pointer(fn->parent, in);

		if (addr_bit_set(addr, bit)) {
			rcu_assign_pointer(in->right, ln);
			rcu_assign_pointer(in->left, fn);
		} else {
			rcu_assign_pointer(in->left, ln);
			rcu_assign_pointer(in->right, fn);
		}
	} else { /* plen <= bit */

		/*
		 *		(new leaf node)[ln]
		 *	          /	   \
		 *	     (old node)[fn] NULL
		 */

		ln = node_alloc(net);

		if (!ln)
			return ERR_PTR(-ENOMEM);

		ln->fn_bit = plen;

		RCU_INIT_POINTER(ln->parent, pn);

		if (addr_bit_set(&key->addr, plen))
			RCU_INIT_POINTER(ln->right, fn);
		else
			RCU_INIT_POINTER(ln->left, fn);

		rcu_assign_pointer(fn->parent, ln);

		if (dir)
			rcu_assign_pointer(pn->right, ln);
		else
			rcu_assign_pointer(pn->left, ln);
	}
	return ln;
}
974
/* Drop the 'from' back-references held by @fib6_nh's per-CPU cached
 * routes, but only for entries that actually point at @match.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
				  const struct fib6_info *match)
{
	int cpu;

	if (!fib6_nh->rt6i_pcpu)
		return;

	rcu_read_lock();
	/* release the reference to this fib entry from
	 * all of its cached pcpu routes
	 */
	for_each_possible_cpu(cpu) {
		struct rt6_info **ppcpu_rt;
		struct rt6_info *pcpu_rt;

		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);

		/* Paired with xchg() in rt6_get_pcpu_route() */
		pcpu_rt = READ_ONCE(*ppcpu_rt);

		/* only dropping the 'from' reference if the cached route
		 * is using 'match'. The cached pcpu_rt->from only changes
		 * from a fib6_info to NULL (ip6_dst_destroy); it can never
		 * change from one fib6_info reference to another
		 */
		if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
			struct fib6_info *from;

			from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
			fib6_info_release(from);
		}
	}
	rcu_read_unlock();
}
1010
/* nexthop_for_each_fib6_nh() adapter: @_arg is the fib6_info whose
 * pcpu cached-route references are being dropped. Always returns 0 so
 * iteration continues over every fib6_nh.
 */
static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
{
	__fib6_drop_pcpu_from(nh, _arg);
	return 0;
}
1018
/* Detach all per-CPU cached routes from @f6i before it is removed.
 * Setting fib6_destroying plus the full barrier stops
 * rt6_make_pcpu_route() from installing new entries while the
 * existing ones are cleaned, for either a shared nexthop object or
 * the embedded fib6_nh.
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i)
{
	/* Make sure rt6_make_pcpu_route() wont add other percpu routes
	 * while we are cleaning them here.
	 */
	f6i->fib6_destroying = 1;
	mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

	if (f6i->nh) {
		rcu_read_lock();
		nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i);
		rcu_read_unlock();
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = f6i->fib6_nh;
		__fib6_drop_pcpu_from(fib6_nh, f6i);
	}
}
1038
/* Disconnect @rt from everything that still references it before it is
 * unlinked: the exception cache, per-CPU cached routes, the nexthop's
 * route list, and any ascendant nodes that use it as a dummy leaf
 * (those are re-pointed at a still-alive, less-specific route). Finally
 * drop it from expiry/GC bookkeeping.
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
			  struct net *net)
{
	struct fib6_table *table = rt->fib6_table;

	/* Flush all cached dst in exception table */
	rt6_flush_exceptions(rt);
	fib6_drop_pcpu_from(rt);

	if (rt->nh) {
		spin_lock(&rt->nh->lock);

		if (!list_empty(&rt->nh_list))
			list_del_init(&rt->nh_list);

		spin_unlock(&rt->nh->lock);
	}

	if (refcount_read(&rt->fib6_ref) != 1) {
		/* This route is used as dummy address holder in some split
		 * nodes. It is not leaked, but it still holds other resources,
		 * which must be released in time. So, scan ascendant nodes
		 * and replace dummy references to this route with references
		 * to still alive ones.
		 */
		while (fn) {
			struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
					    lockdep_is_held(&table->tb6_lock));
			struct fib6_info *new_leaf;
			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
				new_leaf = fib6_find_prefix(net, table, fn);
				fib6_info_hold(new_leaf);

				rcu_assign_pointer(fn->leaf, new_leaf);
				fib6_info_release(rt);
			}
			fn = rcu_dereference_protected(fn->parent,
				    lockdep_is_held(&table->tb6_lock));
		}
	}

	fib6_clean_expires(rt);
	fib6_remove_gc_list(rt);
}
1083
1084 /*
1085 * Insert routing information in a node.
1086 */
1087
/* Insert @rt into the chain of routes hanging off @fn (all routes in a
 * node share the same prefix and are kept sorted by metric), honouring
 * the netlink flags carried in @info->nlh:
 *
 *   NLM_F_EXCL    - fail with -EEXIST if a route with the same metric exists
 *   NLM_F_REPLACE - replace the first matching route (and, when replacing
 *                   an ECMP route, its siblings); replaced entries are
 *                   queued on @purge_list for the caller to purge/release
 *   NLM_F_CREATE  - allow insertion of a new entry
 *
 * Returns 0 on success or a negative errno.  Must be called with
 * rt->fib6_table->tb6_lock held.
 */
static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
			    struct nl_info *info, struct netlink_ext_ack *extack,
			    struct list_head *purge_list)
{
	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
				    lockdep_is_held(&rt->fib6_table->tb6_lock));
	struct fib6_info *iter = NULL;
	struct fib6_info __rcu **ins;
	struct fib6_info __rcu **fallback_ins = NULL;
	int replace = (info->nlh &&
		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
	int add = (!info->nlh ||
		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
	int found = 0;
	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
	bool notify_sibling_rt = false;
	u16 nlflags = NLM_F_EXCL;
	int err;

	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
		nlflags |= NLM_F_APPEND;

	ins = &fn->leaf;

	for (iter = leaf; iter;
	     iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
		/*
		 *	Search for duplicates
		 */

		if (iter->fib6_metric == rt->fib6_metric) {
			/*
			 *	Same priority level
			 */
			if (info->nlh &&
			    (info->nlh->nlmsg_flags & NLM_F_EXCL))
				return -EEXIST;

			nlflags &= ~NLM_F_EXCL;
			if (replace) {
				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
					found++;
					break;
				}
				/* Remember the first same-metric slot in case
				 * no ecmp-compatible route is found below.
				 */
				fallback_ins = fallback_ins ?: ins;
				goto next_iter;
			}

			if (rt6_duplicate_nexthop(iter, rt)) {
				if (rt->fib6_nsiblings)
					WRITE_ONCE(rt->fib6_nsiblings, 0);
				if (!(iter->fib6_flags & RTF_EXPIRES))
					return -EEXIST;
				/* Duplicate of an expiring route: refresh or
				 * clear the existing entry's expiry instead
				 * of inserting a second copy.
				 */
				if (!(rt->fib6_flags & RTF_EXPIRES)) {
					fib6_clean_expires(iter);
					fib6_may_remove_gc_list(info->nl_net, iter);
				} else {
					fib6_set_expires(iter, rt->expires);
					fib6_add_gc_list(iter);
				}
				if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT)) &&
				    (iter->nh || !iter->fib6_nh->fib_nh_gw_family)) {
					iter->fib6_flags &= ~RTF_ADDRCONF;
					iter->fib6_flags &= ~RTF_PREFIX_RT;
				}

				if (rt->fib6_pmtu)
					fib6_metric_set(iter, RTAX_MTU,
							rt->fib6_pmtu);
				return -EEXIST;
			}
			/* If we have the same destination and the same metric,
			 * but not the same gateway, then the route we try to
			 * add is a sibling of this route: increment our counter
			 * of siblings, and later we will add our route to the
			 * list.
			 * Only static routes (which don't have flag
			 * RTF_EXPIRES) are used for ECMPv6.
			 *
			 * To avoid long lists, we only add siblings if the
			 * route has a gateway.
			 */
			if (rt_can_ecmp &&
			    rt6_qualify_for_ecmp(iter))
				WRITE_ONCE(rt->fib6_nsiblings,
					   rt->fib6_nsiblings + 1);
		}

		if (iter->fib6_metric > rt->fib6_metric)
			break;

next_iter:
		ins = &iter->fib6_next;
	}

	if (fallback_ins && !found) {
		/* No matching route with same ecmp-able-ness found, replace
		 * first matching route
		 */
		ins = fallback_ins;
		iter = rcu_dereference_protected(*ins,
			    lockdep_is_held(&rt->fib6_table->tb6_lock));
		found++;
	}

	/* Reset round-robin state, if necessary */
	if (ins == &fn->leaf)
		fn->rr_ptr = NULL;

	/* Link this route to others same route. */
	if (rt->fib6_nsiblings) {
		unsigned int fib6_nsiblings;
		struct fib6_info *sibling, *temp_sibling;

		/* Find the first route that have the same metric */
		sibling = leaf;
		notify_sibling_rt = true;
		while (sibling) {
			if (sibling->fib6_metric == rt->fib6_metric &&
			    rt6_qualify_for_ecmp(sibling)) {
				list_add_tail_rcu(&rt->fib6_siblings,
						  &sibling->fib6_siblings);
				break;
			}
			sibling = rcu_dereference_protected(sibling->fib6_next,
				    lockdep_is_held(&rt->fib6_table->tb6_lock));
			/* rt is only notified as a group leader if it joins in
			 * front of the whole node, i.e. before any route was
			 * skipped over.
			 */
			notify_sibling_rt = false;
		}
		/* For each sibling in the list, increment the counter of
		 * siblings. BUG() if counters do not match, list of siblings
		 * is broken!
		 */
		fib6_nsiblings = 0;
		list_for_each_entry_safe(sibling, temp_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			WRITE_ONCE(sibling->fib6_nsiblings,
				   sibling->fib6_nsiblings + 1);
			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
			fib6_nsiblings++;
		}
		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
		rcu_read_lock();
		rt6_multipath_rebalance(temp_sibling);
		rcu_read_unlock();
	}

	/*
	 *	insert node
	 */
	if (!replace) {
		if (!add)
			pr_warn("NLM_F_CREATE should be set when creating new route\n");

add:
		nlflags |= NLM_F_CREATE;

		/* The route should only be notified if it is the first
		 * route in the node or if it is added as a sibling
		 * route to the first route in the node.
		 */
		if (!info->skip_notify_kernel &&
		    (notify_sibling_rt || ins == &fn->leaf)) {
			enum fib_event_type fib_event;

			if (notify_sibling_rt)
				fib_event = FIB_EVENT_ENTRY_APPEND;
			else
				fib_event = FIB_EVENT_ENTRY_REPLACE;
			err = call_fib6_entry_notifiers(info->nl_net,
							fib_event, rt,
							extack);
			if (err) {
				struct fib6_info *sibling, *next_sibling;

				/* If the route has siblings, then it first
				 * needs to be unlinked from them.
				 */
				if (!rt->fib6_nsiblings)
					return err;

				list_for_each_entry_safe(sibling, next_sibling,
							 &rt->fib6_siblings,
							 fib6_siblings)
					WRITE_ONCE(sibling->fib6_nsiblings,
						   sibling->fib6_nsiblings - 1);
				WRITE_ONCE(rt->fib6_nsiblings, 0);
				list_del_rcu(&rt->fib6_siblings);
				rcu_read_lock();
				rt6_multipath_rebalance(next_sibling);
				rcu_read_unlock();
				return err;
			}
		}

		/* Publish rt: its next pointer is set before *ins makes the
		 * route visible to RCU readers.
		 */
		rcu_assign_pointer(rt->fib6_next, iter);
		fib6_info_hold(rt);
		rcu_assign_pointer(rt->fib6_node, fn);
		rcu_assign_pointer(*ins, rt);
		if (!info->skip_notify)
			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;

		if (!(fn->fn_flags & RTN_RTINFO)) {
			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
			fn->fn_flags |= RTN_RTINFO;
		}

	} else {
		int nsiblings;

		if (!found) {
			if (add)
				goto add;
			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
			return -ENOENT;
		}

		if (!info->skip_notify_kernel && ins == &fn->leaf) {
			err = call_fib6_entry_notifiers(info->nl_net,
							FIB_EVENT_ENTRY_REPLACE,
							rt, extack);
			if (err)
				return err;
		}

		fib6_info_hold(rt);
		rcu_assign_pointer(rt->fib6_node, fn);
		rt->fib6_next = iter->fib6_next;
		rcu_assign_pointer(*ins, rt);
		if (!info->skip_notify)
			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
		if (!(fn->fn_flags & RTN_RTINFO)) {
			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
			fn->fn_flags |= RTN_RTINFO;
		}
		nsiblings = iter->fib6_nsiblings;
		iter->fib6_node = NULL;
		/* Defer purge/release of the replaced route to the caller,
		 * which runs once the tree is consistent again.
		 */
		list_add(&iter->purge_link, purge_list);
		if (rcu_access_pointer(fn->rr_ptr) == iter)
			fn->rr_ptr = NULL;

		if (nsiblings) {
			/* Replacing an ECMP route, remove all siblings */
			ins = &rt->fib6_next;
			iter = rcu_dereference_protected(*ins,
				    lockdep_is_held(&rt->fib6_table->tb6_lock));
			while (iter) {
				if (iter->fib6_metric > rt->fib6_metric)
					break;
				if (rt6_qualify_for_ecmp(iter)) {
					*ins = iter->fib6_next;
					iter->fib6_node = NULL;
					list_add(&iter->purge_link, purge_list);
					if (rcu_access_pointer(fn->rr_ptr) == iter)
						fn->rr_ptr = NULL;
					nsiblings--;
					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
				} else {
					ins = &iter->fib6_next;
				}
				iter = rcu_dereference_protected(*ins,
					    lockdep_is_held(&rt->fib6_table->tb6_lock));
			}
			WARN_ON(nsiblings != 0);
		}
	}

	return 0;
}
1358
fib6_add_rt2node_nh(struct fib6_node * fn,struct fib6_info * rt,struct nl_info * info,struct netlink_ext_ack * extack,struct list_head * purge_list)1359 static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
1360 struct nl_info *info, struct netlink_ext_ack *extack,
1361 struct list_head *purge_list)
1362 {
1363 int err;
1364
1365 spin_lock(&rt->nh->lock);
1366
1367 if (rt->nh->dead) {
1368 NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
1369 err = -EINVAL;
1370 } else {
1371 err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
1372 if (!err)
1373 list_add(&rt->nh_list, &rt->nh->f6i_list);
1374 }
1375
1376 spin_unlock(&rt->nh->lock);
1377
1378 return err;
1379 }
1380
fib6_start_gc(struct net * net,struct fib6_info * rt)1381 static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1382 {
1383 if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1384 (rt->fib6_flags & RTF_EXPIRES))
1385 mod_timer(&net->ipv6.ip6_fib_timer,
1386 jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval));
1387 }
1388
fib6_force_start_gc(struct net * net)1389 void fib6_force_start_gc(struct net *net)
1390 {
1391 if (!timer_pending(&net->ipv6.ip6_fib_timer))
1392 mod_timer(&net->ipv6.ip6_fib_timer,
1393 jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval));
1394 }
1395
__fib6_update_sernum_upto_root(struct fib6_info * rt,int sernum)1396 static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1397 int sernum)
1398 {
1399 struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1400 lockdep_is_held(&rt->fib6_table->tb6_lock));
1401
1402 /* paired with smp_rmb() in fib6_get_cookie_safe() */
1403 smp_wmb();
1404 while (fn) {
1405 WRITE_ONCE(fn->fn_sernum, sernum);
1406 fn = rcu_dereference_protected(fn->parent,
1407 lockdep_is_held(&rt->fib6_table->tb6_lock));
1408 }
1409 }
1410
/* Allocate a fresh serial number for @net and propagate it from @rt's
 * node up to the tree root.  Caller must hold the table lock.
 */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
	int sernum = fib6_new_sernum(net);

	__fib6_update_sernum_upto_root(rt, sernum);
}
1415
1416 /* allow ipv4 to update sernum via ipv6_stub */
fib6_update_sernum_stub(struct net * net,struct fib6_info * f6i)1417 void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1418 {
1419 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1420 fib6_update_sernum_upto_root(net, f6i);
1421 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1422 }
1423
1424 /*
1425 * Add routing information to the routing tree.
1426 * <destination addr>/<source addr>
1427 * with source addr info in sub-trees
1428 * Need to own table->tb6_lock
1429 */
1430
int fib6_add(struct fib6_node *root, struct fib6_info *rt,
	     struct nl_info *info, struct netlink_ext_ack *extack)
{
	struct fib6_table *table = rt->fib6_table;
	/* Routes replaced by fib6_add_rt2node() collect here; they are
	 * purged and released only once the tree is consistent again.
	 */
	LIST_HEAD(purge_list);
	struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
	struct fib6_node *pn = NULL;	/* node in the main (dst) tree */
#endif
	int err = -ENOMEM;
	int allow_create = 1;
	int replace_required = 0;

	if (info->nlh) {
		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
			allow_create = 0;
		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
			replace_required = 1;
	}
	if (!allow_create && !replace_required)
		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");

	/* Find (or create) the node for the destination prefix. */
	fn = fib6_add_1(info->nl_net, table, root,
			&rt->fib6_dst.addr, rt->fib6_dst.plen,
			offsetof(struct fib6_info, fib6_dst), allow_create,
			replace_required, extack);
	if (IS_ERR(fn)) {
		err = PTR_ERR(fn);
		fn = NULL;
		goto out;
	}

#ifdef CONFIG_IPV6_SUBTREES
	pn = fn;

	if (rt->fib6_src.plen) {
		struct fib6_node *sn;

		if (!rcu_access_pointer(fn->subtree)) {
			struct fib6_node *sfn;

			/*
			 * Create subtree.
			 *
			 *	fn[main tree]
			 *	|
			 *	sfn[subtree root]
			 *	   \
			 *	    sn[new leaf node]
			 */

			/* Create subtree root node */
			sfn = node_alloc(info->nl_net);
			if (!sfn)
				goto failure;

			fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
			rcu_assign_pointer(sfn->leaf,
					   info->nl_net->ipv6.fib6_null_entry);
			sfn->fn_flags = RTN_ROOT;

			/* Now add the first leaf node to new subtree */

			sn = fib6_add_1(info->nl_net, table, sfn,
					&rt->fib6_src.addr, rt->fib6_src.plen,
					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);

			if (IS_ERR(sn)) {
				/* If it is failed, discard just allocated
				   root, and then (in failure) stale node
				   in main tree.
				 */
				node_free_immediate(info->nl_net, sfn);
				err = PTR_ERR(sn);
				goto failure;
			}

			/* Now link new subtree to main tree */
			rcu_assign_pointer(sfn->parent, fn);
			rcu_assign_pointer(fn->subtree, sfn);
		} else {
			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
					&rt->fib6_src.addr, rt->fib6_src.plen,
					offsetof(struct fib6_info, fib6_src),
					allow_create, replace_required, extack);

			if (IS_ERR(sn)) {
				err = PTR_ERR(sn);
				goto failure;
			}
		}

		if (!rcu_access_pointer(fn->leaf)) {
			if (fn->fn_flags & RTN_TL_ROOT) {
				/* put back null_entry for root node */
				rcu_assign_pointer(fn->leaf,
					    info->nl_net->ipv6.fib6_null_entry);
			} else {
				fib6_info_hold(rt);
				rcu_assign_pointer(fn->leaf, rt);
			}
		}
		/* The route itself is inserted under the src-prefix node. */
		fn = sn;
	}
#endif

	if (rt->nh)
		err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
	else
		err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
	if (!err) {
		struct fib6_info *iter, *next;

		/* Tree is consistent now; purge and drop replaced routes. */
		list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
			list_del(&iter->purge_link);
			fib6_purge_rt(iter, fn, info->nl_net);
			fib6_info_release(iter);
		}

		__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));

		if (rt->fib6_flags & RTF_EXPIRES)
			fib6_add_gc_list(rt);

		fib6_start_gc(info->nl_net, rt);
	}

out:
	if (err) {
#ifdef CONFIG_IPV6_SUBTREES
		/*
		 * If fib6_add_1 has cleared the old leaf pointer in the
		 * super-tree leaf node we have to find a new one for it.
		 */
		if (pn != fn) {
			struct fib6_info *pn_leaf =
				rcu_dereference_protected(pn->leaf,
					lockdep_is_held(&table->tb6_lock));
			if (pn_leaf == rt) {
				pn_leaf = NULL;
				RCU_INIT_POINTER(pn->leaf, NULL);
				fib6_info_release(rt);
			}
			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
				pn_leaf = fib6_find_prefix(info->nl_net, table,
							   pn);
				if (!pn_leaf)
					pn_leaf =
					    info->nl_net->ipv6.fib6_null_entry;
				fib6_info_hold(pn_leaf);
				rcu_assign_pointer(pn->leaf, pn_leaf);
			}
		}
#endif
		goto failure;
	} else if (fib6_requires_src(rt)) {
		fib6_routes_require_src_inc(info->nl_net);
	}
	return err;

failure:
	/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
	 * 1. fn is an intermediate node and we failed to add the new
	 * route to it in both subtree creation failure and fib6_add_rt2node()
	 * failure case.
	 * 2. fn is the root node in the table and we fail to add the first
	 * default route to it.
	 */
	if (fn &&
	    (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
	     (fn->fn_flags & RTN_TL_ROOT &&
	      !rcu_access_pointer(fn->leaf))))
		fib6_repair_tree(info->nl_net, table, fn);
	return err;
}
1607
1608 /*
1609 * Routing tree lookup
1610 *
1611 */
1612
struct lookup_args {
	int offset;		/* key offset on fib6_info (fib6_dst or
				 * fib6_src); 0 marks the array sentinel */
	const struct in6_addr *addr;	/* search key */
};
1617
/* Longest-prefix-match lookup in one (sub)tree: descend to the leaf
 * suggested by the address bits, then backtrack upward until a node
 * whose key prefix matches @args->addr is found.  Recurses into source
 * subtrees using the next element of @args.  Returns the matching
 * RTN_RTINFO node or NULL.  Runs under RCU.
 */
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
					    struct lookup_args *args)
{
	struct fib6_node *fn;
	__be32 dir;

	/* offset == 0 is the sentinel entry; nothing to match on. */
	if (unlikely(args->offset == 0))
		return NULL;

	/*
	 *	Descend on a tree
	 */

	fn = root;

	for (;;) {
		struct fib6_node *next;

		dir = addr_bit_set(args->addr, fn->fn_bit);

		next = dir ? rcu_dereference(fn->right) :
			     rcu_dereference(fn->left);

		if (next) {
			fn = next;
			continue;
		}
		break;
	}

	/* Walk back up looking for the longest matching prefix. */
	while (fn) {
		struct fib6_node *subtree = FIB6_SUBTREE(fn);

		if (subtree || fn->fn_flags & RTN_RTINFO) {
			struct fib6_info *leaf = rcu_dereference(fn->leaf);
			struct rt6key *key;

			/* Node is being deleted; keep backtracking. */
			if (!leaf)
				goto backtrack;

			key = (struct rt6key *) ((u8 *)leaf + args->offset);

			if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
				/* Recurse into the src-prefix subtree with
				 * the next lookup key.
				 */
				if (subtree) {
					struct fib6_node *sfn;
					sfn = fib6_node_lookup_1(subtree,
								 args + 1);
					if (!sfn)
						goto backtrack;
					fn = sfn;
				}
#endif
				if (fn->fn_flags & RTN_RTINFO)
					return fn;
			}
		}
backtrack:
		if (fn->fn_flags & RTN_ROOT)
			break;

		fn = rcu_dereference(fn->parent);
	}

	return NULL;
}
1684
1685 /* called with rcu_read_lock() held
1686 */
fib6_node_lookup(struct fib6_node * root,const struct in6_addr * daddr,const struct in6_addr * saddr)1687 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1688 const struct in6_addr *daddr,
1689 const struct in6_addr *saddr)
1690 {
1691 struct fib6_node *fn;
1692 struct lookup_args args[] = {
1693 {
1694 .offset = offsetof(struct fib6_info, fib6_dst),
1695 .addr = daddr,
1696 },
1697 #ifdef CONFIG_IPV6_SUBTREES
1698 {
1699 .offset = offsetof(struct fib6_info, fib6_src),
1700 .addr = saddr,
1701 },
1702 #endif
1703 {
1704 .offset = 0, /* sentinel */
1705 }
1706 };
1707
1708 fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1709 if (!fn || fn->fn_flags & RTN_TL_ROOT)
1710 fn = root;
1711
1712 return fn;
1713 }
1714
1715 /*
1716 * Get node with specified destination prefix (and source prefix,
1717 * if subtrees are used)
1718 * exact_match == true means we try to find fn with exact match of
1719 * the passed in prefix addr
1720 * exact_match == false means we try to find fn with longest prefix
1721 * match of the passed in prefix addr. This is useful for finding fn
1722 * for cached route as it will be stored in the exception table under
1723 * the node with longest prefix length.
1724 */
1725
1726
/* Walk one (sub)tree looking for the node whose key equals @addr/@plen
 * at @offset into fib6_info.  With @exact_match the node must match the
 * prefix exactly; otherwise the deepest RTN_RTINFO ancestor seen on the
 * way down (longest prefix match) is returned as a fallback.
 * Runs under RCU.
 */
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
				       const struct in6_addr *addr,
				       int plen, int offset,
				       bool exact_match)
{
	struct fib6_node *fn, *prev = NULL;

	for (fn = root; fn ; ) {
		struct fib6_info *leaf = rcu_dereference(fn->leaf);
		struct rt6key *key;

		/* This node is being deleted */
		if (!leaf) {
			if (plen <= fn->fn_bit)
				goto out;
			else
				goto next;
		}

		key = (struct rt6key *)((u8 *)leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
			goto out;

		if (plen == fn->fn_bit)
			return fn;

		/* Remember the deepest node carrying route info for the
		 * longest-prefix-match fallback below.
		 */
		if (fn->fn_flags & RTN_RTINFO)
			prev = fn;

next:
		/*
		 *	We have more bits to go
		 */
		if (addr_bit_set(addr, fn->fn_bit))
			fn = rcu_dereference(fn->right);
		else
			fn = rcu_dereference(fn->left);
	}
out:
	if (exact_match)
		return NULL;
	else
		return prev;
}
1776
/* Find the node for @daddr/@dst_len (and, with subtrees, @saddr/@src_len).
 * Only nodes that actually carry routes (RTN_RTINFO) are returned;
 * otherwise NULL.  See fib6_locate_1() for @exact_match semantics.
 */
struct fib6_node *fib6_locate(struct fib6_node *root,
			      const struct in6_addr *daddr, int dst_len,
			      const struct in6_addr *saddr, int src_len,
			      bool exact_match)
{
	struct fib6_node *fn = fib6_locate_1(root, daddr, dst_len,
					     offsetof(struct fib6_info, fib6_dst),
					     exact_match);

#ifdef CONFIG_IPV6_SUBTREES
	if (src_len) {
		struct fib6_node *subtree;

		WARN_ON(saddr == NULL);
		subtree = fn ? FIB6_SUBTREE(fn) : NULL;
		if (subtree)
			fn = fib6_locate_1(subtree, saddr, src_len,
					   offsetof(struct fib6_info, fib6_src),
					   exact_match);
	}
#endif

	if (!fn || !(fn->fn_flags & RTN_RTINFO))
		return NULL;

	return fn;
}
1808
1809
1810 /*
1811 * Deletion
1812 *
1813 */
1814
fib6_find_prefix(struct net * net,struct fib6_table * table,struct fib6_node * fn)1815 static struct fib6_info *fib6_find_prefix(struct net *net,
1816 struct fib6_table *table,
1817 struct fib6_node *fn)
1818 {
1819 struct fib6_node *child_left, *child_right;
1820
1821 if (fn->fn_flags & RTN_ROOT)
1822 return net->ipv6.fib6_null_entry;
1823
1824 while (fn) {
1825 child_left = rcu_dereference_protected(fn->left,
1826 lockdep_is_held(&table->tb6_lock));
1827 child_right = rcu_dereference_protected(fn->right,
1828 lockdep_is_held(&table->tb6_lock));
1829 if (child_left)
1830 return rcu_dereference_protected(child_left->leaf,
1831 lockdep_is_held(&table->tb6_lock));
1832 if (child_right)
1833 return rcu_dereference_protected(child_right->leaf,
1834 lockdep_is_held(&table->tb6_lock));
1835
1836 fn = FIB6_SUBTREE(fn);
1837 }
1838 return NULL;
1839 }
1840
1841 /*
1842 * Called to trim the tree of intermediate nodes when possible. "fn"
1843 * is the node we want to try and remove.
1844 * Need to own table->tb6_lock
1845 */
1846
static struct fib6_node *fib6_repair_tree(struct net *net,
					  struct fib6_table *table,
					  struct fib6_node *fn)
{
	int children;
	int nstate;
	struct fib6_node *child;
	struct fib6_walker *w;
	int iter = 0;

	/* Set fn->leaf to null_entry for root node. */
	if (fn->fn_flags & RTN_TL_ROOT) {
		rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
		return fn;
	}

	for (;;) {
		struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn = rcu_dereference_protected(fn->parent,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
				    lockdep_is_held(&table->tb6_lock));
		struct fib6_info *new_fn_leaf;

		pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
		iter++;

		/* Only leaf-less intermediate nodes may be removed here. */
		WARN_ON(fn->fn_flags & RTN_RTINFO);
		WARN_ON(fn->fn_flags & RTN_TL_ROOT);
		WARN_ON(fn_leaf);

		children = 0;
		child = NULL;
		if (fn_r) {
			child = fn_r;
			children |= 1;	/* bit 0: right child present */
		}
		if (fn_l) {
			child = fn_l;
			children |= 2;	/* bit 1: left child present */
		}

		/* A node with both children or a subtree (or a subtree root
		 * with any child) must stay: give it a replacement leaf
		 * instead of removing it.
		 */
		if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
		    /* Subtree root (i.e. fn) may have one child */
		    || (children && fn->fn_flags & RTN_ROOT)
#endif
		    ) {
			new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
			if (!new_fn_leaf) {
				WARN_ON(!new_fn_leaf);
				new_fn_leaf = net->ipv6.fib6_null_entry;
			}
#endif
			fib6_info_hold(new_fn_leaf);
			rcu_assign_pointer(fn->leaf, new_fn_leaf);
			return pn;
		}

#ifdef CONFIG_IPV6_SUBTREES
		if (FIB6_SUBTREE(pn) == fn) {
			/* fn is a childless subtree root: detach it. */
			WARN_ON(!(fn->fn_flags & RTN_ROOT));
			RCU_INIT_POINTER(pn->subtree, NULL);
			nstate = FWS_L;
		} else {
			WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
			/* Splice fn's single child (or NULL) into pn. */
			if (pn_r == fn)
				rcu_assign_pointer(pn->right, child);
			else if (pn_l == fn)
				rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
			else
				WARN_ON(1);
#endif
			if (child)
				rcu_assign_pointer(child->parent, pn);
			nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
		}
#endif

		/* Move any in-flight walkers off the node being freed. */
		read_lock(&net->ipv6.fib6_walker_lock);
		FOR_WALKERS(net, w) {
			if (!child) {
				if (w->node == fn) {
					pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
						 w, w->state, nstate);
					w->node = pn;
					w->state = nstate;
				}
			} else {
				if (w->node == fn) {
					w->node = child;
					if (children&2) {
						pr_debug("W %p adjusted by delnode 2, s=%d\n",
							 w, w->state);
						w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
					} else {
						pr_debug("W %p adjusted by delnode 2, s=%d\n",
							 w, w->state);
						w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
					}
				}
			}
		}
		read_unlock(&net->ipv6.fib6_walker_lock);

		node_free(net, fn);
		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
			return pn;

		/* Parent became leaf-less too: drop its leaf and repeat. */
		RCU_INIT_POINTER(pn->leaf, NULL);
		fib6_info_release(pn_leaf);
		fn = pn;
	}
}
1974
/* Unlink the route at slot @rtp from node @fn, fix up walkers and the
 * tree (repairing/removing the node if it became empty), emit kernel
 * and netlink notifications, and drop the tree's reference on the route.
 * Caller must hold table->tb6_lock.
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
			   struct fib6_info __rcu **rtp, struct nl_info *info)
{
	struct fib6_info *leaf, *replace_rt = NULL;
	struct fib6_walker *w;
	struct fib6_info *rt = rcu_dereference_protected(*rtp,
				    lockdep_is_held(&table->tb6_lock));
	struct net *net = info->nl_net;
	bool notify_del = false;

	/* If the deleted route is the first in the node and it is not part of
	 * a multipath route, then we need to replace it with the next route
	 * in the node, if exists.
	 */
	leaf = rcu_dereference_protected(fn->leaf,
					 lockdep_is_held(&table->tb6_lock));
	if (leaf == rt && !rt->fib6_nsiblings) {
		if (rcu_access_pointer(rt->fib6_next))
			replace_rt = rcu_dereference_protected(rt->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
		else
			notify_del = true;
	}

	/* Unlink it */
	*rtp = rt->fib6_next;
	rt->fib6_node = NULL;
	net->ipv6.rt6_stats->fib_rt_entries--;
	net->ipv6.rt6_stats->fib_discarded_routes++;

	/* Reset round-robin state, if necessary */
	if (rcu_access_pointer(fn->rr_ptr) == rt)
		fn->rr_ptr = NULL;

	/* Remove this entry from other siblings */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		/* The route is deleted from a multipath route. If this
		 * multipath route is the first route in the node, then we need
		 * to emit a delete notification. Otherwise, we need to skip
		 * the notification.
		 */
		if (rt->fib6_metric == leaf->fib6_metric &&
		    rt6_qualify_for_ecmp(leaf))
			notify_del = true;
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings)
			WRITE_ONCE(sibling->fib6_nsiblings,
				   sibling->fib6_nsiblings - 1);
		WRITE_ONCE(rt->fib6_nsiblings, 0);
		list_del_rcu(&rt->fib6_siblings);
		rt6_multipath_rebalance(next_sibling);
	}

	/* Adjust walkers */
	read_lock(&net->ipv6.fib6_walker_lock);
	FOR_WALKERS(net, w) {
		if (w->state == FWS_C && w->leaf == rt) {
			pr_debug("walker %p adjusted by delroute\n", w);
			w->leaf = rcu_dereference_protected(rt->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (!w->leaf)
				w->state = FWS_U;
		}
	}
	read_unlock(&net->ipv6.fib6_walker_lock);

	/* If it was last route, call fib6_repair_tree() to:
	 * 1. For root node, put back null_entry as how the table was created.
	 * 2. For other nodes, expunge its radix tree node.
	 */
	if (!rcu_access_pointer(fn->leaf)) {
		if (!(fn->fn_flags & RTN_TL_ROOT)) {
			fn->fn_flags &= ~RTN_RTINFO;
			net->ipv6.rt6_stats->fib_route_nodes--;
		}
		fn = fib6_repair_tree(net, table, fn);
	}

	fib6_purge_rt(rt, fn, net);

	if (!info->skip_notify_kernel) {
		if (notify_del)
			call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
						  rt, NULL);
		else if (replace_rt)
			call_fib6_entry_notifiers_replace(net, replace_rt);
	}
	if (!info->skip_notify)
		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

	fib6_info_release(rt);
}
2069
2070 /* Need to own table->tb6_lock */
/* Delete @rt from its table.  Locates the slot pointing at @rt in its
 * node's route chain and hands it to fib6_del_route().  Returns 0 on
 * success, -ENOENT if the route is the null entry or is no longer
 * linked into the tree.  Need to own table->tb6_lock.
 */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_info __rcu **rtp;
	struct fib6_info __rcu **rtp_next;
	struct fib6_table *table;
	struct fib6_node *fn;

	/* The null entry is never deleted. */
	if (rt == net->ipv6.fib6_null_entry)
		return -ENOENT;

	table = rt->fib6_table;
	fn = rcu_dereference_protected(rt->fib6_node,
				       lockdep_is_held(&table->tb6_lock));
	/* Already unlinked from the tree. */
	if (!fn)
		return -ENOENT;

	WARN_ON(!(fn->fn_flags & RTN_RTINFO));

	/*
	 *	Walk the leaf entries looking for ourself
	 */

	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
		struct fib6_info *cur = rcu_dereference_protected(*rtp,
					    lockdep_is_held(&table->tb6_lock));
		if (rt == cur) {
			if (fib6_requires_src(cur))
				fib6_routes_require_src_dec(info->nl_net);
			/* rtp is the slot holding rt; unlink through it. */
			fib6_del_route(table, fn, rtp, info);
			return 0;
		}
		rtp_next = &cur->fib6_next;
	}
	return -ENOENT;
}
2107
2108 /*
2109 * Tree traversal function.
2110 *
2111 * Certainly, it is not interrupt safe.
2112 * However, it is internally reenterable wrt itself and fib6_add/fib6_del.
2113 * It means, that we can modify tree during walking
2114 * and use this function for garbage collection, clone pruning,
2115 * cleaning tree when a device goes down etc. etc.
2116 *
2117 * It guarantees that every node will be traversed,
2118 * and that it will be traversed only once.
2119 *
2120 * Callback function w->func may return:
2121 * 0 -> continue walking.
2122 * positive value -> walking is suspended (used by tree dumps,
2123 * and probably by gc, if it will be split to several slices)
2124 * negative value -> terminate walking.
2125 *
2126 * The function itself returns:
2127 * 0 -> walk is complete.
2128 * >0 -> walk is incomplete (i.e. suspended)
2129 * <0 -> walk is terminated by an error.
2130 *
2131 * This function is called with tb6_lock held.
2132 */
2133
static int fib6_walk_continue(struct fib6_walker *w)
{
	struct fib6_node *fn, *pn, *left, *right;

	/* w->root should always be table->tb6_root */
	WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

	for (;;) {
		fn = w->node;
		if (!fn)
			return 0;

		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			/* Visit the source-prefix subtree first. */
			if (FIB6_SUBTREE(fn)) {
				w->node = FIB6_SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
			fallthrough;
#endif
		case FWS_L:
			/* ...then descend left... */
			left = rcu_dereference_protected(fn->left, 1);
			if (left) {
				w->node = left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			fallthrough;
		case FWS_R:
			/* ...then right... */
			right = rcu_dereference_protected(fn->right, 1);
			if (right) {
				w->node = right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = rcu_dereference_protected(fn->leaf, 1);
			fallthrough;
		case FWS_C:
			/* ...then visit this node's own routes. */
			if (w->leaf && fn->fn_flags & RTN_RTINFO) {
				int err;

				/* w->skip lets dumps resume mid-walk. */
				if (w->skip) {
					w->skip--;
					goto skip;
				}

				err = w->func(w);
				if (err)
					return err;

				w->count++;
				continue;
			}
skip:
			w->state = FWS_U;
			fallthrough;
		case FWS_U:
			/* Climb back to the parent; resume in the state
			 * matching which child we just finished.
			 */
			if (fn == w->root)
				return 0;
			pn = rcu_dereference_protected(fn->parent, 1);
			left = rcu_dereference_protected(pn->left, 1);
			right = rcu_dereference_protected(pn->right, 1);
			w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
			if (FIB6_SUBTREE(pn) == fn) {
				WARN_ON(!(fn->fn_flags & RTN_ROOT));
				w->state = FWS_L;
				continue;
			}
#endif
			if (left == fn) {
				w->state = FWS_R;
				continue;
			}
			if (right == fn) {
				w->state = FWS_C;
				w->leaf = rcu_dereference_protected(w->node->leaf, 1);
				continue;
			}
#if RT6_DEBUG >= 2
			WARN_ON(1);
#endif
		}
	}
}
2223
fib6_walk(struct net * net,struct fib6_walker * w)2224 static int fib6_walk(struct net *net, struct fib6_walker *w)
2225 {
2226 int res;
2227
2228 w->state = FWS_INIT;
2229 w->node = w->root;
2230
2231 fib6_walker_link(net, w);
2232 res = fib6_walk_continue(w);
2233 if (res <= 0)
2234 fib6_walker_unlink(net, w);
2235 return res;
2236 }
2237
/* Walker callback for fib6_clean_tree(): stamp the node's sernum (if a
 * change was requested) and run the cleaner's func on each route.
 * func returning -1 deletes the route; -2 skips the remainder of a
 * multipath group; 0 continues.
 */
static int fib6_clean_node(struct fib6_walker *w)
{
	int res;
	struct fib6_info *rt;
	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
	struct nl_info info = {
		.nl_net = c->net,
		.skip_notify = c->skip_notify,
	};

	/* Propagate the new serial number to every node visited. */
	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
	    READ_ONCE(w->node->fn_sernum) != c->sernum)
		WRITE_ONCE(w->node->fn_sernum, c->sernum);

	/* Sernum-only update: nothing per-route to do. */
	if (!c->func) {
		WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
		w->leaf = NULL;
		return 0;
	}

	for_each_fib6_walker_rt(w) {
		res = c->func(rt, c->arg);
		if (res == -1) {
			/* Callback asked us to delete this route. */
			w->leaf = rt;
			res = fib6_del(rt, &info);
			if (res) {
#if RT6_DEBUG >= 2
				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
					 __func__, rt,
					 rcu_access_pointer(rt->fib6_node),
					 res);
#endif
				continue;
			}
			return 0;
		} else if (res == -2) {
			/* Skip the rest of this multipath route by jumping
			 * to its last sibling.
			 */
			if (WARN_ON(!rt->fib6_nsiblings))
				continue;
			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info, fib6_siblings);
			continue;
		}
		WARN_ON(res != 0);
	}
	w->leaf = rt;
	return 0;
}
2285
2286 /*
2287 * Convenient frontend to tree walker.
2288 *
2289 * func is called on each route.
2290 * It may return -2 -> skip multipath route.
2291 * -1 -> delete this route.
2292 * 0 -> continue walking
2293 */
2294
fib6_clean_tree(struct net * net,struct fib6_node * root,int (* func)(struct fib6_info *,void * arg),int sernum,void * arg,bool skip_notify)2295 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2296 int (*func)(struct fib6_info *, void *arg),
2297 int sernum, void *arg, bool skip_notify)
2298 {
2299 struct fib6_cleaner c;
2300
2301 c.w.root = root;
2302 c.w.func = fib6_clean_node;
2303 c.w.count = 0;
2304 c.w.skip = 0;
2305 c.w.skip_in_node = 0;
2306 c.func = func;
2307 c.sernum = sernum;
2308 c.arg = arg;
2309 c.net = net;
2310 c.skip_notify = skip_notify;
2311
2312 fib6_walk(net, &c.w);
2313 }
2314
/* Run fib6_clean_tree() over every table of @net, holding each table's
 * write lock for the duration of its walk.
 */
static void __fib6_clean_all(struct net *net,
			     int (*func)(struct fib6_info *, void *),
			     int sernum, void *arg, bool skip_notify)
{
	struct fib6_table *table;
	unsigned int h;

	rcu_read_lock();
	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		hlist_for_each_entry_rcu(table, &net->ipv6.fib_table_hash[h],
					 tb6_hlist) {
			spin_lock_bh(&table->tb6_lock);
			fib6_clean_tree(net, &table->tb6_root,
					func, sernum, arg, skip_notify);
			spin_unlock_bh(&table->tb6_lock);
		}
	}
	rcu_read_unlock();
}
2335
fib6_clean_all(struct net * net,int (* func)(struct fib6_info *,void *),void * arg)2336 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2337 void *arg)
2338 {
2339 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2340 }
2341
fib6_clean_all_skip_notify(struct net * net,int (* func)(struct fib6_info *,void *),void * arg)2342 void fib6_clean_all_skip_notify(struct net *net,
2343 int (*func)(struct fib6_info *, void *),
2344 void *arg)
2345 {
2346 __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2347 }
2348
fib6_flush_trees(struct net * net)2349 static void fib6_flush_trees(struct net *net)
2350 {
2351 int new_sernum = fib6_new_sernum(net);
2352
2353 __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2354 }
2355
2356 /*
2357 * Garbage collection
2358 */
fib6_age_exceptions(struct fib6_info * rt,struct fib6_gc_args * gc_args,unsigned long now)2359 void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args,
2360 unsigned long now)
2361 {
2362 bool may_expire = rt->fib6_flags & RTF_EXPIRES && rt->expires;
2363 int old_more = gc_args->more;
2364
2365 rt6_age_exceptions(rt, gc_args, now);
2366
2367 if (!may_expire && old_more == gc_args->more)
2368 fib6_remove_gc_list(rt);
2369 }
2370
fib6_age(struct fib6_info * rt,struct fib6_gc_args * gc_args)2371 static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
2372 {
2373 unsigned long now = jiffies;
2374
2375 /*
2376 * check addrconf expiration here.
2377 * Routes are expired even if they are in use.
2378 */
2379
2380 if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2381 if (time_after(now, rt->expires)) {
2382 pr_debug("expiring %p\n", rt);
2383 return -1;
2384 }
2385 gc_args->more++;
2386 }
2387
2388 /* Also age clones in the exception table.
2389 * Note, that clones are aged out
2390 * only if they are not in use now.
2391 */
2392 fib6_age_exceptions(rt, gc_args, now);
2393
2394 return 0;
2395 }
2396
fib6_gc_table(struct net * net,struct fib6_table * tb6,struct fib6_gc_args * gc_args)2397 static void fib6_gc_table(struct net *net,
2398 struct fib6_table *tb6,
2399 struct fib6_gc_args *gc_args)
2400 {
2401 struct fib6_info *rt;
2402 struct hlist_node *n;
2403 struct nl_info info = {
2404 .nl_net = net,
2405 .skip_notify = false,
2406 };
2407
2408 hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
2409 if (fib6_age(rt, gc_args) == -1)
2410 fib6_del(rt, &info);
2411 }
2412
fib6_gc_all(struct net * net,struct fib6_gc_args * gc_args)2413 static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
2414 {
2415 struct fib6_table *table;
2416 struct hlist_head *head;
2417 unsigned int h;
2418
2419 rcu_read_lock();
2420 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2421 head = &net->ipv6.fib_table_hash[h];
2422 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2423 spin_lock_bh(&table->tb6_lock);
2424
2425 fib6_gc_table(net, table, gc_args);
2426
2427 spin_unlock_bh(&table->tb6_lock);
2428 }
2429 }
2430 rcu_read_unlock();
2431 }
2432
/* Run one GC pass.  @force serializes on fib6_gc_lock; otherwise a pass
 * already in flight makes us just re-arm the timer.  @expires, when
 * non-zero, overrides the sysctl GC interval for this pass.
 */
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
	struct fib6_gc_args gc_args;
	unsigned long now;
	int gc_interval;

	if (force) {
		spin_lock_bh(&net->ipv6.fib6_gc_lock);
	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
		/* Someone else is collecting; try again in a second. */
		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
		return;
	}

	gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval);
	gc_args.timeout = expires ? (int)expires : gc_interval;
	gc_args.more = 0;

	fib6_gc_all(net, &gc_args);
	now = jiffies;
	net->ipv6.ip6_rt_last_gc = now;

	/* Re-arm only while some route still has a pending expiry. */
	if (gc_args.more)
		mod_timer(&net->ipv6.ip6_fib_timer,
			  round_jiffies(now + gc_interval));
	else
		timer_delete(&net->ipv6.ip6_fib_timer);
	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
2460
fib6_gc_timer_cb(struct timer_list * t)2461 static void fib6_gc_timer_cb(struct timer_list *t)
2462 {
2463 struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer);
2464
2465 fib6_run_gc(0, arg, true);
2466 }
2467
fib6_net_init(struct net * net)2468 static int __net_init fib6_net_init(struct net *net)
2469 {
2470 size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
2471 int err;
2472
2473 err = fib6_notifier_init(net);
2474 if (err)
2475 return err;
2476
2477 /* Default to 3-tuple */
2478 net->ipv6.sysctl.multipath_hash_fields =
2479 FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
2480
2481 spin_lock_init(&net->ipv6.fib6_gc_lock);
2482 rwlock_init(&net->ipv6.fib6_walker_lock);
2483 INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
2484 timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
2485
2486 net->ipv6.rt6_stats = kzalloc_obj(*net->ipv6.rt6_stats);
2487 if (!net->ipv6.rt6_stats)
2488 goto out_notifier;
2489
2490 /* Avoid false sharing : Use at least a full cache line */
2491 size = max_t(size_t, size, L1_CACHE_BYTES);
2492
2493 net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
2494 if (!net->ipv6.fib_table_hash)
2495 goto out_rt6_stats;
2496
2497 spin_lock_init(&net->ipv6.fib_table_hash_lock);
2498
2499 net->ipv6.fib6_main_tbl = kzalloc_obj(*net->ipv6.fib6_main_tbl);
2500 if (!net->ipv6.fib6_main_tbl)
2501 goto out_fib_table_hash;
2502
2503 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2504 rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2505 net->ipv6.fib6_null_entry);
2506 net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2507 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2508 inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
2509 INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);
2510
2511 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2512 net->ipv6.fib6_local_tbl = kzalloc_obj(*net->ipv6.fib6_local_tbl);
2513 if (!net->ipv6.fib6_local_tbl)
2514 goto out_fib6_main_tbl;
2515 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2516 rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2517 net->ipv6.fib6_null_entry);
2518 net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2519 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2520 inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
2521 INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
2522 #endif
2523 fib6_tables_init(net);
2524
2525 return 0;
2526
2527 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2528 out_fib6_main_tbl:
2529 kfree(net->ipv6.fib6_main_tbl);
2530 #endif
2531 out_fib_table_hash:
2532 kfree(net->ipv6.fib_table_hash);
2533 out_rt6_stats:
2534 kfree(net->ipv6.rt6_stats);
2535 out_notifier:
2536 fib6_notifier_exit(net);
2537 return -ENOMEM;
2538 }
2539
fib6_net_exit(struct net * net)2540 static void fib6_net_exit(struct net *net)
2541 {
2542 unsigned int i;
2543
2544 timer_delete_sync(&net->ipv6.ip6_fib_timer);
2545
2546 for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2547 struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2548 struct hlist_node *tmp;
2549 struct fib6_table *tb;
2550
2551 hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2552 hlist_del(&tb->tb6_hlist);
2553 fib6_free_table(tb);
2554 }
2555 }
2556
2557 kfree(net->ipv6.fib_table_hash);
2558 kfree(net->ipv6.rt6_stats);
2559 fib6_notifier_exit(net);
2560 }
2561
2562 static struct pernet_operations fib6_net_ops = {
2563 .init = fib6_net_init,
2564 .exit = fib6_net_exit,
2565 };
2566
2567 static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
2568 {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
2569 .dumpit = inet6_dump_fib,
2570 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
2571 };
2572
fib6_init(void)2573 int __init fib6_init(void)
2574 {
2575 int ret = -ENOMEM;
2576
2577 fib6_node_kmem = KMEM_CACHE(fib6_node,
2578 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
2579 if (!fib6_node_kmem)
2580 goto out;
2581
2582 ret = register_pernet_subsys(&fib6_net_ops);
2583 if (ret)
2584 goto out_kmem_cache_create;
2585
2586 ret = rtnl_register_many(fib6_rtnl_msg_handlers);
2587 if (ret)
2588 goto out_unregister_subsys;
2589
2590 __fib6_flush_trees = fib6_flush_trees;
2591 out:
2592 return ret;
2593
2594 out_unregister_subsys:
2595 unregister_pernet_subsys(&fib6_net_ops);
2596 out_kmem_cache_create:
2597 kmem_cache_destroy(fib6_node_kmem);
2598 goto out;
2599 }
2600
fib6_gc_cleanup(void)2601 void fib6_gc_cleanup(void)
2602 {
2603 unregister_pernet_subsys(&fib6_net_ops);
2604 kmem_cache_destroy(fib6_node_kmem);
2605 }
2606
2607 #ifdef CONFIG_PROC_FS
ipv6_route_native_seq_show(struct seq_file * seq,void * v)2608 static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
2609 {
2610 struct fib6_info *rt = v;
2611 struct ipv6_route_iter *iter = seq->private;
2612 struct fib6_nh *fib6_nh = rt->fib6_nh;
2613 unsigned int flags = rt->fib6_flags;
2614 const struct net_device *dev;
2615
2616 if (rt->nh)
2617 fib6_nh = nexthop_fib6_nh(rt->nh);
2618
2619 seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2620
2621 #ifdef CONFIG_IPV6_SUBTREES
2622 seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2623 #else
2624 seq_puts(seq, "00000000000000000000000000000000 00 ");
2625 #endif
2626 if (fib6_nh->fib_nh_gw_family) {
2627 flags |= RTF_GATEWAY;
2628 seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2629 } else {
2630 seq_puts(seq, "00000000000000000000000000000000");
2631 }
2632
2633 dev = fib6_nh->fib_nh_dev;
2634 seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2635 rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2636 flags, dev ? dev->name : "");
2637 iter->w.leaf = NULL;
2638 return 0;
2639 }
2640
ipv6_route_yield(struct fib6_walker * w)2641 static int ipv6_route_yield(struct fib6_walker *w)
2642 {
2643 struct ipv6_route_iter *iter = w->args;
2644
2645 if (!iter->skip)
2646 return 1;
2647
2648 do {
2649 iter->w.leaf = rcu_dereference_protected(
2650 iter->w.leaf->fib6_next,
2651 lockdep_is_held(&iter->tbl->tb6_lock));
2652 iter->skip--;
2653 if (!iter->skip && iter->w.leaf)
2654 return 1;
2655 } while (iter->w.leaf);
2656
2657 return 0;
2658 }
2659
ipv6_route_seq_setup_walk(struct ipv6_route_iter * iter,struct net * net)2660 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2661 struct net *net)
2662 {
2663 memset(&iter->w, 0, sizeof(iter->w));
2664 iter->w.func = ipv6_route_yield;
2665 iter->w.root = &iter->tbl->tb6_root;
2666 iter->w.state = FWS_INIT;
2667 iter->w.node = iter->w.root;
2668 iter->w.args = iter;
2669 iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
2670 INIT_LIST_HEAD(&iter->w.lh);
2671 fib6_walker_link(net, &iter->w);
2672 }
2673
ipv6_route_seq_next_table(struct fib6_table * tbl,struct net * net)2674 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2675 struct net *net)
2676 {
2677 unsigned int h;
2678 struct hlist_node *node;
2679
2680 if (tbl) {
2681 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2682 node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
2683 } else {
2684 h = 0;
2685 node = NULL;
2686 }
2687
2688 while (!node && h < FIB6_TABLE_HASHSZ) {
2689 node = rcu_dereference(
2690 hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2691 }
2692 return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2693 }
2694
ipv6_route_check_sernum(struct ipv6_route_iter * iter)2695 static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2696 {
2697 int sernum = READ_ONCE(iter->w.root->fn_sernum);
2698
2699 if (iter->sernum != sernum) {
2700 iter->sernum = sernum;
2701 iter->w.state = FWS_INIT;
2702 iter->w.node = iter->w.root;
2703 WARN_ON(iter->w.skip);
2704 iter->w.skip = iter->w.count;
2705 }
2706 }
2707
/* seq_file .next: try the leaf chain of the current node first, then
 * resume the walker, then fall through to the next table.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipv6_route_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib6_info *next;
	int ret;

	++(*pos);
	if (!v)
		goto iter_table;

	/* Fast path: another route hangs off the same leaf. */
	next = rcu_dereference(((struct fib6_info *)v)->fib6_next);
	if (next)
		return next;

iter_table:
	ipv6_route_check_sernum(iter);
	spin_lock_bh(&iter->tbl->tb6_lock);
	ret = fib6_walk_continue(&iter->w);
	spin_unlock_bh(&iter->tbl->tb6_lock);
	if (ret > 0)
		return iter->w.leaf;

	fib6_walker_unlink(net, &iter->w);
	if (ret < 0)
		return NULL;

	/* Current table exhausted: move on to the next one, if any. */
	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
	if (!iter->tbl)
		return NULL;

	ipv6_route_seq_setup_walk(iter, net);
	goto iter_table;
}
2743
/* seq_file .start: take the RCU read lock (released in .stop), pick the
 * first table and position the iterator at *pos.
 */
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct ipv6_route_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t p = 0;

	rcu_read_lock();
	iter->tbl = ipv6_route_seq_next_table(NULL, net);
	iter->skip = *pos;

	if (!iter->tbl)
		return NULL;

	ipv6_route_seq_setup_walk(iter, net);
	return ipv6_route_seq_next(seq, NULL, &p);
}
2763
ipv6_route_iter_active(struct ipv6_route_iter * iter)2764 static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2765 {
2766 struct fib6_walker *w = &iter->w;
2767 return w->node && !(w->state == FWS_U && w->node == w->root);
2768 }
2769
ipv6_route_native_seq_stop(struct seq_file * seq,void * v)2770 static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
2771 __releases(RCU)
2772 {
2773 struct net *net = seq_file_net(seq);
2774 struct ipv6_route_iter *iter = seq->private;
2775
2776 if (ipv6_route_iter_active(iter))
2777 fib6_walker_unlink(net, &iter->w);
2778
2779 rcu_read_unlock();
2780 }
2781
2782 #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
ipv6_route_prog_seq_show(struct bpf_prog * prog,struct bpf_iter_meta * meta,void * v)2783 static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
2784 struct bpf_iter_meta *meta,
2785 void *v)
2786 {
2787 struct bpf_iter__ipv6_route ctx;
2788
2789 ctx.meta = meta;
2790 ctx.rt = v;
2791 return bpf_iter_run_prog(prog, &ctx);
2792 }
2793
ipv6_route_seq_show(struct seq_file * seq,void * v)2794 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2795 {
2796 struct ipv6_route_iter *iter = seq->private;
2797 struct bpf_iter_meta meta;
2798 struct bpf_prog *prog;
2799 int ret;
2800
2801 meta.seq = seq;
2802 prog = bpf_iter_get_info(&meta, false);
2803 if (!prog)
2804 return ipv6_route_native_seq_show(seq, v);
2805
2806 ret = ipv6_route_prog_seq_show(prog, &meta, v);
2807 iter->w.leaf = NULL;
2808
2809 return ret;
2810 }
2811
ipv6_route_seq_stop(struct seq_file * seq,void * v)2812 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2813 {
2814 struct bpf_iter_meta meta;
2815 struct bpf_prog *prog;
2816
2817 if (!v) {
2818 meta.seq = seq;
2819 prog = bpf_iter_get_info(&meta, true);
2820 if (prog)
2821 (void)ipv6_route_prog_seq_show(prog, &meta, v);
2822 }
2823
2824 ipv6_route_native_seq_stop(seq, v);
2825 }
2826 #else
/* Without BPF iterator support, .show is just the native printer. */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
	return ipv6_route_native_seq_show(seq, v);
}
2831
/* Without BPF iterator support, .stop is just the native teardown. */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
	ipv6_route_native_seq_stop(seq, v);
}
2836 #endif
2837
2838 const struct seq_operations ipv6_route_seq_ops = {
2839 .start = ipv6_route_seq_start,
2840 .next = ipv6_route_seq_next,
2841 .stop = ipv6_route_seq_stop,
2842 .show = ipv6_route_seq_show
2843 };
2844 #endif /* CONFIG_PROC_FS */
2845