1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		ROUTE - implementation of the IP router.
7  *
8  * Authors:	Ross Biro
9  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *		Alan Cox	:	Verify area fixes.
16  *		Alan Cox	:	cli() protects routing changes
17  *		Rui Oliveira	:	ICMP routing table updates
18  *		(rco@di.uminho.pt)	Routing table insertion and update
19  *		Linus Torvalds	:	Rewrote bits to be sensible
20  *		Alan Cox	:	Added BSD route gw semantics
21  *		Alan Cox	:	Super /proc >4K
22  *		Alan Cox	:	MTU in route table
23  *		Alan Cox	: 	MSS actually. Also added the window
24  *					clamper.
25  *		Sam Lantinga	:	Fixed route matching in rt_del()
26  *		Alan Cox	:	Routing cache support.
27  *		Alan Cox	:	Removed compatibility cruft.
28  *		Alan Cox	:	RTF_REJECT support.
29  *		Alan Cox	:	TCP irtt support.
30  *		Jonathan Naylor	:	Added Metric support.
31  *	Miquel van Smoorenburg	:	BSD API fixes.
32  *	Miquel van Smoorenburg	:	Metrics.
33  *		Alan Cox	:	Use __u32 properly
34  *		Alan Cox	:	Aligned routing errors more closely with BSD;
35  *					our system is still very different.
36  *		Alan Cox	:	Faster /proc handling
37  *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
38  *					routing caches and better behaviour.
39  *
40  *		Olaf Erb	:	irtt wasn't being copied right.
41  *		Bjorn Ekwall	:	Kerneld route support.
42  *		Alan Cox	:	Multicast fixed (I hope)
43  * 		Pavel Krauz	:	Limited broadcast fixed
44  *		Mike McLagan	:	Routing by source
45  *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
46  *					route.c and rewritten from scratch.
47  *		Andi Kleen	:	Load-limit warning messages.
48  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
49  *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
50  *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
51  *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
52  *		Marc Boucher	:	routing by fwmark
53  *	Robert Olsson		:	Added rt_cache statistics
54  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
55  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
56  * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
57  * 	Ilia Sotnikov		:	Removed TOS from hash calculations
58  *
59  *		This program is free software; you can redistribute it and/or
60  *		modify it under the terms of the GNU General Public License
61  *		as published by the Free Software Foundation; either version
62  *		2 of the License, or (at your option) any later version.
63  */
64 
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/secure_seq.h>
113 
114 #define RT_FL_TOS(oldflp4) \
115 	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
116 
117 #define IP_MAX_MTU	0xFFF0
118 
119 #define RT_GC_TIMEOUT (300*HZ)
120 
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
124 static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
125 static int ip_rt_redirect_number __read_mostly	= 9;
126 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly	= HZ;
129 static int ip_rt_error_burst __read_mostly	= 5 * HZ;
130 static int ip_rt_gc_elasticity __read_mostly	= 8;
131 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
132 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
133 static int ip_rt_min_advmss __read_mostly	= 256;
134 static int rt_chain_length_max __read_mostly	= 20;
135 
136 static struct delayed_work expires_work;
137 static unsigned long expires_ljiffies;
138 
139 /*
140  *	Interface to generic destination cache.
141  */
142 
143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144 static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
145 static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
146 static void		 ipv4_dst_destroy(struct dst_entry *dst);
147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148 static void		 ipv4_link_failure(struct sk_buff *skb);
149 static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
150 static int rt_garbage_collect(struct dst_ops *ops);
151 
152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
153 			    int how)
154 {
155 }
156 
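/*
 * Copy dst metrics on write: detach this route's metrics from the shared,
 * read-only template and point them at the writable per-peer copy.
 */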
157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 {
159 	struct rtable *rt = (struct rtable *) dst;
160 	struct inet_peer *peer;
161 	u32 *p = NULL;
162 
163 	if (!rt->peer)
164 		rt_bind_peer(rt, rt->rt_dst, 1);
165 
166 	peer = rt->peer;
167 	if (peer) {
168 		u32 *old_p = __DST_METRICS_PTR(old);
169 		unsigned long prev, new;
170 
171 		p = peer->metrics;
172 		if (inet_metrics_new(peer))
173 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
174 
175 		new = (unsigned long) p;
176 		prev = cmpxchg(&dst->_metrics, old, new);
177 
178 		if (prev != old) {
179 			p = __DST_METRICS_PTR(prev);
180 			if (prev & DST_METRICS_READ_ONLY)
181 				p = NULL;
182 		} else {
183 			if (rt->fi) {
184 				fib_info_put(rt->fi);
185 				rt->fi = NULL;
186 			}
187 		}
188 	}
189 	return p;
190 }
191 
192 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
193 
194 static struct dst_ops ipv4_dst_ops = {
195 	.family =		AF_INET,
196 	.protocol =		cpu_to_be16(ETH_P_IP),
197 	.gc =			rt_garbage_collect,
198 	.check =		ipv4_dst_check,
199 	.default_advmss =	ipv4_default_advmss,
200 	.mtu =			ipv4_mtu,
201 	.cow_metrics =		ipv4_cow_metrics,
202 	.destroy =		ipv4_dst_destroy,
203 	.ifdown =		ipv4_dst_ifdown,
204 	.negative_advice =	ipv4_negative_advice,
205 	.link_failure =		ipv4_link_failure,
206 	.update_pmtu =		ip_rt_update_pmtu,
207 	.local_out =		__ip_local_out,
208 	.neigh_lookup =		ipv4_neigh_lookup,
209 };
210 
211 #define ECN_OR_COST(class)	TC_PRIO_##class
212 
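/* Map the IPv4 TOS field to a packet scheduler priority band. */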
213 const __u8 ip_tos2prio[16] = {
214 	TC_PRIO_BESTEFFORT,
215 	ECN_OR_COST(BESTEFFORT),
216 	TC_PRIO_BESTEFFORT,
217 	ECN_OR_COST(BESTEFFORT),
218 	TC_PRIO_BULK,
219 	ECN_OR_COST(BULK),
220 	TC_PRIO_BULK,
221 	ECN_OR_COST(BULK),
222 	TC_PRIO_INTERACTIVE,
223 	ECN_OR_COST(INTERACTIVE),
224 	TC_PRIO_INTERACTIVE,
225 	ECN_OR_COST(INTERACTIVE),
226 	TC_PRIO_INTERACTIVE_BULK,
227 	ECN_OR_COST(INTERACTIVE_BULK),
228 	TC_PRIO_INTERACTIVE_BULK,
229 	ECN_OR_COST(INTERACTIVE_BULK)
230 };
231 
232 
233 /*
234  * Route cache.
235  */
236 
237 /* The locking scheme is rather straightforward:
238  *
239  * 1) Read-Copy Update protects the buckets of the central route hash.
240  * 2) Only writers remove entries, and they hold the lock
241  *    as they look at rtable reference counts.
242  * 3) Only readers acquire references to rtable entries,
243  *    they do so with atomic increments and with the
244  *    lock held.
245  */
246 
247 struct rt_hash_bucket {
248 	struct rtable __rcu	*chain;
249 };
250 
251 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
252 	defined(CONFIG_PROVE_LOCKING)
253 /*
254  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
255  * The size of this table is a power of two and depends on the number of CPUs.
256  * (With lockdep we have a quite big spinlock_t, so keep the size down there.)
257  */
258 #ifdef CONFIG_LOCKDEP
259 # define RT_HASH_LOCK_SZ	256
260 #else
261 # if NR_CPUS >= 32
262 #  define RT_HASH_LOCK_SZ	4096
263 # elif NR_CPUS >= 16
264 #  define RT_HASH_LOCK_SZ	2048
265 # elif NR_CPUS >= 8
266 #  define RT_HASH_LOCK_SZ	1024
267 # elif NR_CPUS >= 4
268 #  define RT_HASH_LOCK_SZ	512
269 # else
270 #  define RT_HASH_LOCK_SZ	256
271 # endif
272 #endif
273 
274 static spinlock_t	*rt_hash_locks;
275 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
276 
277 static __init void rt_hash_lock_init(void)
278 {
279 	int i;
280 
281 	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
282 			GFP_KERNEL);
283 	if (!rt_hash_locks)
284 		panic("IP: failed to allocate rt_hash_locks\n");
285 
286 	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
287 		spin_lock_init(&rt_hash_locks[i]);
288 }
289 #else
290 # define rt_hash_lock_addr(slot) NULL
291 
292 static inline void rt_hash_lock_init(void)
293 {
294 }
295 #endif
296 
297 static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
298 static unsigned			rt_hash_mask __read_mostly;
299 static unsigned int		rt_hash_log  __read_mostly;
300 
301 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
302 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
303 
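/*
 * Hash the (daddr, saddr, ifindex) triple together with the per-namespace
 * generation id into a bucket index of the route cache.
 */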
304 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
305 				   int genid)
306 {
307 	return jhash_3words((__force u32)daddr, (__force u32)saddr,
308 			    idx, genid)
309 		& rt_hash_mask;
310 }
311 
312 static inline int rt_genid(struct net *net)
313 {
314 	return atomic_read(&net->ipv4.rt_genid);
315 }
316 
317 #ifdef CONFIG_PROC_FS
318 struct rt_cache_iter_state {
319 	struct seq_net_private p;
320 	int bucket;
321 	int genid;
322 };
323 
324 static struct rtable *rt_cache_get_first(struct seq_file *seq)
325 {
326 	struct rt_cache_iter_state *st = seq->private;
327 	struct rtable *r = NULL;
328 
329 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
330 		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
331 			continue;
332 		rcu_read_lock_bh();
333 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
334 		while (r) {
335 			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
336 			    r->rt_genid == st->genid)
337 				return r;
338 			r = rcu_dereference_bh(r->dst.rt_next);
339 		}
340 		rcu_read_unlock_bh();
341 	}
342 	return r;
343 }
344 
345 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
346 					  struct rtable *r)
347 {
348 	struct rt_cache_iter_state *st = seq->private;
349 
350 	r = rcu_dereference_bh(r->dst.rt_next);
351 	while (!r) {
352 		rcu_read_unlock_bh();
353 		do {
354 			if (--st->bucket < 0)
355 				return NULL;
356 		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
357 		rcu_read_lock_bh();
358 		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
359 	}
360 	return r;
361 }
362 
363 static struct rtable *rt_cache_get_next(struct seq_file *seq,
364 					struct rtable *r)
365 {
366 	struct rt_cache_iter_state *st = seq->private;
367 	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
368 		if (dev_net(r->dst.dev) != seq_file_net(seq))
369 			continue;
370 		if (r->rt_genid == st->genid)
371 			break;
372 	}
373 	return r;
374 }
375 
376 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
377 {
378 	struct rtable *r = rt_cache_get_first(seq);
379 
380 	if (r)
381 		while (pos && (r = rt_cache_get_next(seq, r)))
382 			--pos;
383 	return pos ? NULL : r;
384 }
385 
386 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
387 {
388 	struct rt_cache_iter_state *st = seq->private;
389 	if (*pos)
390 		return rt_cache_get_idx(seq, *pos - 1);
391 	st->genid = rt_genid(seq_file_net(seq));
392 	return SEQ_START_TOKEN;
393 }
394 
395 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
396 {
397 	struct rtable *r;
398 
399 	if (v == SEQ_START_TOKEN)
400 		r = rt_cache_get_first(seq);
401 	else
402 		r = rt_cache_get_next(seq, v);
403 	++*pos;
404 	return r;
405 }
406 
407 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
408 {
409 	if (v && v != SEQ_START_TOKEN)
410 		rcu_read_unlock_bh();
411 }
412 
413 static int rt_cache_seq_show(struct seq_file *seq, void *v)
414 {
415 	if (v == SEQ_START_TOKEN)
416 		seq_printf(seq, "%-127s\n",
417 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
418 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
419 			   "HHUptod\tSpecDst");
420 	else {
421 		struct rtable *r = v;
422 		struct neighbour *n;
423 		int len, HHUptod;
424 
425 		rcu_read_lock();
426 		n = dst_get_neighbour_noref(&r->dst);
427 		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
428 		rcu_read_unlock();
429 
430 		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
431 			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
432 			r->dst.dev ? r->dst.dev->name : "*",
433 			(__force u32)r->rt_dst,
434 			(__force u32)r->rt_gateway,
435 			r->rt_flags, atomic_read(&r->dst.__refcnt),
436 			r->dst.__use, 0, (__force u32)r->rt_src,
437 			dst_metric_advmss(&r->dst) + 40,
438 			dst_metric(&r->dst, RTAX_WINDOW),
439 			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
440 			      dst_metric(&r->dst, RTAX_RTTVAR)),
441 			r->rt_key_tos,
442 			-1,
443 			HHUptod,
444 			r->rt_spec_dst, &len);
445 
446 		seq_printf(seq, "%*s\n", 127 - len, "");
447 	}
448 	return 0;
449 }
450 
451 static const struct seq_operations rt_cache_seq_ops = {
452 	.start  = rt_cache_seq_start,
453 	.next   = rt_cache_seq_next,
454 	.stop   = rt_cache_seq_stop,
455 	.show   = rt_cache_seq_show,
456 };
457 
458 static int rt_cache_seq_open(struct inode *inode, struct file *file)
459 {
460 	return seq_open_net(inode, file, &rt_cache_seq_ops,
461 			sizeof(struct rt_cache_iter_state));
462 }
463 
464 static const struct file_operations rt_cache_seq_fops = {
465 	.owner	 = THIS_MODULE,
466 	.open	 = rt_cache_seq_open,
467 	.read	 = seq_read,
468 	.llseek	 = seq_lseek,
469 	.release = seq_release_net,
470 };
471 
472 
473 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
474 {
475 	int cpu;
476 
477 	if (*pos == 0)
478 		return SEQ_START_TOKEN;
479 
480 	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
481 		if (!cpu_possible(cpu))
482 			continue;
483 		*pos = cpu+1;
484 		return &per_cpu(rt_cache_stat, cpu);
485 	}
486 	return NULL;
487 }
488 
489 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
490 {
491 	int cpu;
492 
493 	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
494 		if (!cpu_possible(cpu))
495 			continue;
496 		*pos = cpu+1;
497 		return &per_cpu(rt_cache_stat, cpu);
498 	}
499 	return NULL;
500 
501 }
502 
503 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
504 {
505 
506 }
507 
508 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
509 {
510 	struct rt_cache_stat *st = v;
511 
512 	if (v == SEQ_START_TOKEN) {
513 		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
514 		return 0;
515 	}
516 
517 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
518 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
519 		   dst_entries_get_slow(&ipv4_dst_ops),
520 		   st->in_hit,
521 		   st->in_slow_tot,
522 		   st->in_slow_mc,
523 		   st->in_no_route,
524 		   st->in_brd,
525 		   st->in_martian_dst,
526 		   st->in_martian_src,
527 
528 		   st->out_hit,
529 		   st->out_slow_tot,
530 		   st->out_slow_mc,
531 
532 		   st->gc_total,
533 		   st->gc_ignored,
534 		   st->gc_goal_miss,
535 		   st->gc_dst_overflow,
536 		   st->in_hlist_search,
537 		   st->out_hlist_search
538 		);
539 	return 0;
540 }
541 
542 static const struct seq_operations rt_cpu_seq_ops = {
543 	.start  = rt_cpu_seq_start,
544 	.next   = rt_cpu_seq_next,
545 	.stop   = rt_cpu_seq_stop,
546 	.show   = rt_cpu_seq_show,
547 };
548 
549 
550 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
551 {
552 	return seq_open(file, &rt_cpu_seq_ops);
553 }
554 
555 static const struct file_operations rt_cpu_seq_fops = {
556 	.owner	 = THIS_MODULE,
557 	.open	 = rt_cpu_seq_open,
558 	.read	 = seq_read,
559 	.llseek	 = seq_lseek,
560 	.release = seq_release,
561 };
562 
563 #ifdef CONFIG_IP_ROUTE_CLASSID
564 static int rt_acct_proc_show(struct seq_file *m, void *v)
565 {
566 	struct ip_rt_acct *dst, *src;
567 	unsigned int i, j;
568 
569 	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
570 	if (!dst)
571 		return -ENOMEM;
572 
573 	for_each_possible_cpu(i) {
574 		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
575 		for (j = 0; j < 256; j++) {
576 			dst[j].o_bytes   += src[j].o_bytes;
577 			dst[j].o_packets += src[j].o_packets;
578 			dst[j].i_bytes   += src[j].i_bytes;
579 			dst[j].i_packets += src[j].i_packets;
580 		}
581 	}
582 
583 	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
584 	kfree(dst);
585 	return 0;
586 }
587 
588 static int rt_acct_proc_open(struct inode *inode, struct file *file)
589 {
590 	return single_open(file, rt_acct_proc_show, NULL);
591 }
592 
593 static const struct file_operations rt_acct_proc_fops = {
594 	.owner		= THIS_MODULE,
595 	.open		= rt_acct_proc_open,
596 	.read		= seq_read,
597 	.llseek		= seq_lseek,
598 	.release	= single_release,
599 };
600 #endif
601 
602 static int __net_init ip_rt_do_proc_init(struct net *net)
603 {
604 	struct proc_dir_entry *pde;
605 
606 	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
607 			&rt_cache_seq_fops);
608 	if (!pde)
609 		goto err1;
610 
611 	pde = proc_create("rt_cache", S_IRUGO,
612 			  net->proc_net_stat, &rt_cpu_seq_fops);
613 	if (!pde)
614 		goto err2;
615 
616 #ifdef CONFIG_IP_ROUTE_CLASSID
617 	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
618 	if (!pde)
619 		goto err3;
620 #endif
621 	return 0;
622 
623 #ifdef CONFIG_IP_ROUTE_CLASSID
624 err3:
625 	remove_proc_entry("rt_cache", net->proc_net_stat);
626 #endif
627 err2:
628 	remove_proc_entry("rt_cache", net->proc_net);
629 err1:
630 	return -ENOMEM;
631 }
632 
633 static void __net_exit ip_rt_do_proc_exit(struct net *net)
634 {
635 	remove_proc_entry("rt_cache", net->proc_net_stat);
636 	remove_proc_entry("rt_cache", net->proc_net);
637 #ifdef CONFIG_IP_ROUTE_CLASSID
638 	remove_proc_entry("rt_acct", net->proc_net);
639 #endif
640 }
641 
642 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
643 	.init = ip_rt_do_proc_init,
644 	.exit = ip_rt_do_proc_exit,
645 };
646 
647 static int __init ip_rt_proc_init(void)
648 {
649 	return register_pernet_subsys(&ip_rt_proc_ops);
650 }
651 
652 #else
653 static inline int ip_rt_proc_init(void)
654 {
655 	return 0;
656 }
657 #endif /* CONFIG_PROC_FS */
658 
659 static inline void rt_free(struct rtable *rt)
660 {
661 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
662 }
663 
664 static inline void rt_drop(struct rtable *rt)
665 {
666 	ip_rt_put(rt);
667 	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
668 }
669 
670 static inline int rt_fast_clean(struct rtable *rth)
671 {
672 	/* Kill broadcast/multicast entries very aggressively if they
673 	   collide in the hash table with more useful entries */
674 	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
675 		rt_is_input_route(rth) && rth->dst.rt_next;
676 }
677 
678 static inline int rt_valuable(struct rtable *rth)
679 {
680 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
681 		(rth->peer && rth->peer->pmtu_expires);
682 }
683 
684 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
685 {
686 	unsigned long age;
687 	int ret = 0;
688 
689 	if (atomic_read(&rth->dst.__refcnt))
690 		goto out;
691 
692 	age = jiffies - rth->dst.lastuse;
693 	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
694 	    (age <= tmo2 && rt_valuable(rth)))
695 		goto out;
696 	ret = 1;
697 out:	return ret;
698 }
699 
700 /* Bits of score are:
701  * 31: very valuable
702  * 30: not quite useless
703  * 29..0: usage counter
704  */
705 static inline u32 rt_score(struct rtable *rt)
706 {
707 	u32 score = jiffies - rt->dst.lastuse;
708 
709 	score = ~score & ~(3<<30);
710 
711 	if (rt_valuable(rt))
712 		score |= (1<<31);
713 
714 	if (rt_is_output_route(rt) ||
715 	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716 		score |= (1<<30);
717 
718 	return score;
719 }
720 
721 static inline bool rt_caching(const struct net *net)
722 {
723 	return net->ipv4.current_rt_cache_rebuild_count <=
724 		net->ipv4.sysctl_rt_cache_rebuild_count;
725 }
726 
727 static inline bool compare_hash_inputs(const struct rtable *rt1,
728 				       const struct rtable *rt2)
729 {
730 	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
732 		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
733 }
734 
735 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
736 {
737 	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
738 		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
739 		(rt1->rt_mark ^ rt2->rt_mark) |
740 		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
741 		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
742 		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
743 }
744 
745 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
746 {
747 	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
748 }
749 
750 static inline int rt_is_expired(struct rtable *rth)
751 {
752 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
753 }
754 
755 /*
756  * Perform a full scan of hash table and free all entries.
757  * Can be called by a softirq or a process.
758  * In the latter case, we want to reschedule if necessary.
759  */
760 static void rt_do_flush(struct net *net, int process_context)
761 {
762 	unsigned int i;
763 	struct rtable *rth, *next;
764 
765 	for (i = 0; i <= rt_hash_mask; i++) {
766 		struct rtable __rcu **pprev;
767 		struct rtable *list;
768 
769 		if (process_context && need_resched())
770 			cond_resched();
771 		rth = rcu_access_pointer(rt_hash_table[i].chain);
772 		if (!rth)
773 			continue;
774 
775 		spin_lock_bh(rt_hash_lock_addr(i));
776 
777 		list = NULL;
778 		pprev = &rt_hash_table[i].chain;
779 		rth = rcu_dereference_protected(*pprev,
780 			lockdep_is_held(rt_hash_lock_addr(i)));
781 
782 		while (rth) {
783 			next = rcu_dereference_protected(rth->dst.rt_next,
784 				lockdep_is_held(rt_hash_lock_addr(i)));
785 
786 			if (!net ||
787 			    net_eq(dev_net(rth->dst.dev), net)) {
788 				rcu_assign_pointer(*pprev, next);
789 				rcu_assign_pointer(rth->dst.rt_next, list);
790 				list = rth;
791 			} else {
792 				pprev = &rth->dst.rt_next;
793 			}
794 			rth = next;
795 		}
796 
797 		spin_unlock_bh(rt_hash_lock_addr(i));
798 
799 		for (; list; list = next) {
800 			next = rcu_dereference_protected(list->dst.rt_next, 1);
801 			rt_free(list);
802 		}
803 	}
804 }
805 
806 /*
807  * While freeing expired entries, we compute average chain length
808  * and standard deviation, using fixed-point arithmetic.
809  * This is to have an estimate of rt_chain_length_max:
810  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
811  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
812  */
813 
814 #define FRACT_BITS 3
815 #define ONE (1UL << FRACT_BITS)
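/*
 * Example: a chain of 5 entries with distinct hash inputs accumulates a
 * length of 5 * ONE = 40; (avg + 4*sd) >> FRACT_BITS converts the
 * fixed-point result back to a whole number of entries.
 */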
816 
817 /*
818  * Given a hash chain and an item in this hash chain,
819  * find whether a previous entry has the same hash_inputs
820  * (but differs on tos, mark or oif)
821  * Returns 0 if an alias is found.
822  * Returns ONE if rth has no alias before itself.
823  */
824 static int has_noalias(const struct rtable *head, const struct rtable *rth)
825 {
826 	const struct rtable *aux = head;
827 
828 	while (aux != rth) {
829 		if (compare_hash_inputs(aux, rth))
830 			return 0;
831 		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
832 	}
833 	return ONE;
834 }
835 
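/*
 * Scan a slice of the hash table (sized by the time elapsed since the last
 * run), freeing expired entries and updating the chain length statistics
 * that feed rt_chain_length_max.
 */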
836 static void rt_check_expire(void)
837 {
838 	static unsigned int rover;
839 	unsigned int i = rover, goal;
840 	struct rtable *rth;
841 	struct rtable __rcu **rthp;
842 	unsigned long samples = 0;
843 	unsigned long sum = 0, sum2 = 0;
844 	unsigned long delta;
845 	u64 mult;
846 
847 	delta = jiffies - expires_ljiffies;
848 	expires_ljiffies = jiffies;
849 	mult = ((u64)delta) << rt_hash_log;
850 	if (ip_rt_gc_timeout > 1)
851 		do_div(mult, ip_rt_gc_timeout);
852 	goal = (unsigned int)mult;
853 	if (goal > rt_hash_mask)
854 		goal = rt_hash_mask + 1;
855 	for (; goal > 0; goal--) {
856 		unsigned long tmo = ip_rt_gc_timeout;
857 		unsigned long length;
858 
859 		i = (i + 1) & rt_hash_mask;
860 		rthp = &rt_hash_table[i].chain;
861 
862 		if (need_resched())
863 			cond_resched();
864 
865 		samples++;
866 
867 		if (rcu_dereference_raw(*rthp) == NULL)
868 			continue;
869 		length = 0;
870 		spin_lock_bh(rt_hash_lock_addr(i));
871 		while ((rth = rcu_dereference_protected(*rthp,
872 					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
873 			prefetch(rth->dst.rt_next);
874 			if (rt_is_expired(rth)) {
875 				*rthp = rth->dst.rt_next;
876 				rt_free(rth);
877 				continue;
878 			}
879 			if (rth->dst.expires) {
880 				/* Entry is expired even if it is in use */
881 				if (time_before_eq(jiffies, rth->dst.expires)) {
882 nofree:
883 					tmo >>= 1;
884 					rthp = &rth->dst.rt_next;
885 					/*
886 					 * We only count entries on
887 					 * a chain with equal hash inputs once
888 					 * so that entries for different QOS
889 					 * levels, and other non-hash input
890 					 * attributes don't unfairly skew
891 					 * the length computation
892 					 */
893 					length += has_noalias(rt_hash_table[i].chain, rth);
894 					continue;
895 				}
896 			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
897 				goto nofree;
898 
899 			/* Cleanup aged off entries. */
900 			*rthp = rth->dst.rt_next;
901 			rt_free(rth);
902 		}
903 		spin_unlock_bh(rt_hash_lock_addr(i));
904 		sum += length;
905 		sum2 += length*length;
906 	}
907 	if (samples) {
908 		unsigned long avg = sum / samples;
909 		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
910 		rt_chain_length_max = max_t(unsigned long,
911 					ip_rt_gc_elasticity,
912 					(avg + 4*sd) >> FRACT_BITS);
913 	}
914 	rover = i;
915 }
916 
917 /*
918  * rt_worker_func() is run in process context.
919  * we call rt_check_expire() to scan part of the hash table
920  */
921 static void rt_worker_func(struct work_struct *work)
922 {
923 	rt_check_expire();
924 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
925 }
926 
927 /*
928  * Perturbation of rt_genid by a small quantity [1..256]
929  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
930  * many times (2^24) without reusing a recent rt_genid.
931  * Jenkins hash is strong enough that little changes of rt_genid are OK.
932  */
933 static void rt_cache_invalidate(struct net *net)
934 {
935 	unsigned char shuffle;
936 
937 	get_random_bytes(&shuffle, sizeof(shuffle));
938 	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
939 	inetpeer_invalidate_tree(AF_INET);
940 }
941 
942 /*
943  * delay < 0  : invalidate cache (fast : entries will be deleted later)
944  * delay >= 0 : invalidate & flush cache (can be long)
945  */
946 void rt_cache_flush(struct net *net, int delay)
947 {
948 	rt_cache_invalidate(net);
949 	if (delay >= 0)
950 		rt_do_flush(net, !in_softirq());
951 }
952 
953 /* Flush previous cache invalidated entries from the cache */
954 void rt_cache_flush_batch(struct net *net)
955 {
956 	rt_do_flush(net, !in_softirq());
957 }
958 
959 static void rt_emergency_hash_rebuild(struct net *net)
960 {
961 	if (net_ratelimit())
962 		printk(KERN_WARNING "Route hash chain too long!\n");
963 	rt_cache_invalidate(net);
964 }
965 
966 /*
967    Short description of GC goals.
968 
969    We want an algorithm that keeps the routing cache
970    at some equilibrium point, where the number of aged-off entries
971    is approximately equal to the number of newly generated ones.
972 
973    The current expiration strength is the variable "expire".
974    We try to adjust it dynamically, so that when the network
975    is idle expire is large enough to keep enough warm entries,
976    and when load increases it shrinks to limit the cache size.
977  */
978 
979 static int rt_garbage_collect(struct dst_ops *ops)
980 {
981 	static unsigned long expire = RT_GC_TIMEOUT;
982 	static unsigned long last_gc;
983 	static int rover;
984 	static int equilibrium;
985 	struct rtable *rth;
986 	struct rtable __rcu **rthp;
987 	unsigned long now = jiffies;
988 	int goal;
989 	int entries = dst_entries_get_fast(&ipv4_dst_ops);
990 
991 	/*
992 	 * Garbage collection is pretty expensive,
993 	 * do not make it too frequently.
994 	 */
995 
996 	RT_CACHE_STAT_INC(gc_total);
997 
998 	if (now - last_gc < ip_rt_gc_min_interval &&
999 	    entries < ip_rt_max_size) {
1000 		RT_CACHE_STAT_INC(gc_ignored);
1001 		goto out;
1002 	}
1003 
1004 	entries = dst_entries_get_slow(&ipv4_dst_ops);
1005 	/* Calculate number of entries, which we want to expire now. */
1006 	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1007 	if (goal <= 0) {
1008 		if (equilibrium < ipv4_dst_ops.gc_thresh)
1009 			equilibrium = ipv4_dst_ops.gc_thresh;
1010 		goal = entries - equilibrium;
1011 		if (goal > 0) {
1012 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1013 			goal = entries - equilibrium;
1014 		}
1015 	} else {
1016 		/* We are in a dangerous area. Try to reduce the cache really
1017 		 * aggressively.
1018 		 */
1019 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1020 		equilibrium = entries - goal;
1021 	}
1022 
1023 	if (now - last_gc >= ip_rt_gc_min_interval)
1024 		last_gc = now;
1025 
1026 	if (goal <= 0) {
1027 		equilibrium += goal;
1028 		goto work_done;
1029 	}
1030 
1031 	do {
1032 		int i, k;
1033 
1034 		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035 			unsigned long tmo = expire;
1036 
1037 			k = (k + 1) & rt_hash_mask;
1038 			rthp = &rt_hash_table[k].chain;
1039 			spin_lock_bh(rt_hash_lock_addr(k));
1040 			while ((rth = rcu_dereference_protected(*rthp,
1041 					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1042 				if (!rt_is_expired(rth) &&
1043 					!rt_may_expire(rth, tmo, expire)) {
1044 					tmo >>= 1;
1045 					rthp = &rth->dst.rt_next;
1046 					continue;
1047 				}
1048 				*rthp = rth->dst.rt_next;
1049 				rt_free(rth);
1050 				goal--;
1051 			}
1052 			spin_unlock_bh(rt_hash_lock_addr(k));
1053 			if (goal <= 0)
1054 				break;
1055 		}
1056 		rover = k;
1057 
1058 		if (goal <= 0)
1059 			goto work_done;
1060 
1061 		/* Goal was not achieved. We stop the process if:
1062 
1063 		   - expire has been reduced to zero; otherwise, expire is halved.
1064 		   - the table is not full.
1065 		   - we are called from interrupt context.
1066 		   - the jiffies check is just a fallback/debug loop breaker.
1067 		     We will not spin here for a long time in any case.
1068 		 */
1069 
1070 		RT_CACHE_STAT_INC(gc_goal_miss);
1071 
1072 		if (expire == 0)
1073 			break;
1074 
1075 		expire >>= 1;
1076 
1077 		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078 			goto out;
1079 	} while (!in_softirq() && time_before_eq(jiffies, now));
1080 
1081 	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082 		goto out;
1083 	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1084 		goto out;
1085 	if (net_ratelimit())
1086 		printk(KERN_WARNING "dst cache overflow\n");
1087 	RT_CACHE_STAT_INC(gc_dst_overflow);
1088 	return 1;
1089 
1090 work_done:
1091 	expire += ip_rt_gc_min_interval;
1092 	if (expire > ip_rt_gc_timeout ||
1093 	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094 	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095 		expire = ip_rt_gc_timeout;
1096 out:	return 0;
1097 }
1098 
1099 /*
1100  * Returns the number of entries in a hash chain that have distinct hash_inputs
1101  */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104 	int length = 0;
1105 	const struct rtable *rth = head;
1106 
1107 	while (rth) {
1108 		length += has_noalias(head, rth);
1109 		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110 	}
1111 	return length >> FRACT_BITS;
1112 }
1113 
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116 	static const __be32 inaddr_any = 0;
1117 	struct net_device *dev = dst->dev;
1118 	const __be32 *pkey = daddr;
1119 	struct neighbour *n;
1120 
1121 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1122 		pkey = &inaddr_any;
1123 
1124 	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
1125 	if (n)
1126 		return n;
1127 	return neigh_create(&arp_tbl, pkey, dev);
1128 }
1129 
1130 static int rt_bind_neighbour(struct rtable *rt)
1131 {
1132 	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1133 	if (IS_ERR(n))
1134 		return PTR_ERR(n);
1135 	dst_set_neighbour(&rt->dst, n);
1136 
1137 	return 0;
1138 }
1139 
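/*
 * Insert rt into the hash chain selected by 'hash', or return an existing
 * entry with identical keys.  Falls back to an uncached (DST_NOCACHE) entry
 * when caching is disabled, evicts the lowest-scored unreferenced entry when
 * the chain grows too long, and binds unicast/output routes to their ARP
 * neighbour.
 */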
1140 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1141 				     struct sk_buff *skb, int ifindex)
1142 {
1143 	struct rtable	*rth, *cand;
1144 	struct rtable __rcu **rthp, **candp;
1145 	unsigned long	now;
1146 	u32 		min_score;
1147 	int		chain_length;
1148 	int attempts = !in_softirq();
1149 
1150 restart:
1151 	chain_length = 0;
1152 	min_score = ~(u32)0;
1153 	cand = NULL;
1154 	candp = NULL;
1155 	now = jiffies;
1156 
1157 	if (!rt_caching(dev_net(rt->dst.dev))) {
1158 		/*
1159 		 * If we're not caching, just tell the caller we
1160 		 * were successful and don't touch the route.  The
1161 		 * caller holds the sole reference to the cache entry, and
1162 		 * it will be released when the caller is done with it.
1163 		 * If we drop it here, the callers have no way to resolve routes
1164 		 * when we're not caching.  Instead, just point *rp at rt, so
1165 		 * the caller gets a single use out of the route.
1166 		 * Note that we do rt_free on this new route entry, so that
1167 		 * once its refcount hits zero, we are still able to reap it
1168 		 * (Thanks Alexey)
1169 		 * Note: To avoid expensive rcu stuff for this uncached dst,
1170 		 * we set DST_NOCACHE so that dst_release() can free dst without
1171 		 * waiting for a grace period.
1172 		 */
1173 
1174 		rt->dst.flags |= DST_NOCACHE;
1175 		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1176 			int err = rt_bind_neighbour(rt);
1177 			if (err) {
1178 				if (net_ratelimit())
1179 					printk(KERN_WARNING
1180 					    "Neighbour table failure & not caching routes.\n");
1181 				ip_rt_put(rt);
1182 				return ERR_PTR(err);
1183 			}
1184 		}
1185 
1186 		goto skip_hashing;
1187 	}
1188 
1189 	rthp = &rt_hash_table[hash].chain;
1190 
1191 	spin_lock_bh(rt_hash_lock_addr(hash));
1192 	while ((rth = rcu_dereference_protected(*rthp,
1193 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1194 		if (rt_is_expired(rth)) {
1195 			*rthp = rth->dst.rt_next;
1196 			rt_free(rth);
1197 			continue;
1198 		}
1199 		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1200 			/* Put it first */
1201 			*rthp = rth->dst.rt_next;
1202 			/*
1203 			 * Since lookup is lockfree, the deletion
1204 			 * must be visible to another weakly ordered CPU before
1205 			 * the insertion at the start of the hash chain.
1206 			 */
1207 			rcu_assign_pointer(rth->dst.rt_next,
1208 					   rt_hash_table[hash].chain);
1209 			/*
1210 			 * Since lookup is lockfree, the update writes
1211 			 * must be ordered for consistency on SMP.
1212 			 */
1213 			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1214 
1215 			dst_use(&rth->dst, now);
1216 			spin_unlock_bh(rt_hash_lock_addr(hash));
1217 
1218 			rt_drop(rt);
1219 			if (skb)
1220 				skb_dst_set(skb, &rth->dst);
1221 			return rth;
1222 		}
1223 
1224 		if (!atomic_read(&rth->dst.__refcnt)) {
1225 			u32 score = rt_score(rth);
1226 
1227 			if (score <= min_score) {
1228 				cand = rth;
1229 				candp = rthp;
1230 				min_score = score;
1231 			}
1232 		}
1233 
1234 		chain_length++;
1235 
1236 		rthp = &rth->dst.rt_next;
1237 	}
1238 
1239 	if (cand) {
1240 		/* ip_rt_gc_elasticity used to be the average chain length;
1241 		 * when exceeded, gc becomes really aggressive.
1242 		 *
1243 		 * The second limit is less certain. At the moment it allows
1244 		 * only 2 entries per bucket. We will see.
1245 		 */
1246 		if (chain_length > ip_rt_gc_elasticity) {
1247 			*candp = cand->dst.rt_next;
1248 			rt_free(cand);
1249 		}
1250 	} else {
1251 		if (chain_length > rt_chain_length_max &&
1252 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1253 			struct net *net = dev_net(rt->dst.dev);
1254 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1255 			if (!rt_caching(net)) {
1256 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1257 					rt->dst.dev->name, num);
1258 			}
1259 			rt_emergency_hash_rebuild(net);
1260 			spin_unlock_bh(rt_hash_lock_addr(hash));
1261 
1262 			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1263 					ifindex, rt_genid(net));
1264 			goto restart;
1265 		}
1266 	}
1267 
1268 	/* Try to bind the route to an ARP neighbour only if it is an output
1269 	   route or on the unicast forwarding path.
1270 	 */
1271 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1272 		int err = rt_bind_neighbour(rt);
1273 		if (err) {
1274 			spin_unlock_bh(rt_hash_lock_addr(hash));
1275 
1276 			if (err != -ENOBUFS) {
1277 				rt_drop(rt);
1278 				return ERR_PTR(err);
1279 			}
1280 
1281 			/* Neighbour tables are full and nothing
1282 			   can be released. Try to shrink route cache,
1283 			   it is most likely it holds some neighbour records.
1284 			 */
1285 			if (attempts-- > 0) {
1286 				int saved_elasticity = ip_rt_gc_elasticity;
1287 				int saved_int = ip_rt_gc_min_interval;
1288 				ip_rt_gc_elasticity	= 1;
1289 				ip_rt_gc_min_interval	= 0;
1290 				rt_garbage_collect(&ipv4_dst_ops);
1291 				ip_rt_gc_min_interval	= saved_int;
1292 				ip_rt_gc_elasticity	= saved_elasticity;
1293 				goto restart;
1294 			}
1295 
1296 			if (net_ratelimit())
1297 				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1298 			rt_drop(rt);
1299 			return ERR_PTR(-ENOBUFS);
1300 		}
1301 	}
1302 
1303 	rt->dst.rt_next = rt_hash_table[hash].chain;
1304 
1305 	/*
1306 	 * Since lookup is lockfree, we must make sure
1307 	 * previous writes to rt are committed to memory
1308 	 * before making rt visible to other CPUS.
1309 	 */
1310 	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1311 
1312 	spin_unlock_bh(rt_hash_lock_addr(hash));
1313 
1314 skip_hashing:
1315 	if (skb)
1316 		skb_dst_set(skb, &rt->dst);
1317 	return rt;
1318 }
1319 
1320 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1321 
1322 static u32 rt_peer_genid(void)
1323 {
1324 	return atomic_read(&__rt_peer_genid);
1325 }
1326 
1327 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1328 {
1329 	struct inet_peer *peer;
1330 
1331 	peer = inet_getpeer_v4(daddr, create);
1332 
1333 	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1334 		inet_putpeer(peer);
1335 	else
1336 		rt->rt_peer_genid = rt_peer_genid();
1337 }
1338 
1339 /*
1340  * Peer allocation may fail only in serious out-of-memory conditions.  However
1341  * we can still generate some output.
1342  * Random ID selection looks a bit dangerous because we have no chance of
1343  * selecting an ID that is unique over a reasonable period of time.
1344  * But a broken packet identifier may be better than no packet at all.
1345  */
1346 static void ip_select_fb_ident(struct iphdr *iph)
1347 {
1348 	static DEFINE_SPINLOCK(ip_fb_id_lock);
1349 	static u32 ip_fallback_id;
1350 	u32 salt;
1351 
1352 	spin_lock_bh(&ip_fb_id_lock);
1353 	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1354 	iph->id = htons(salt & 0xFFFF);
1355 	ip_fallback_id = salt;
1356 	spin_unlock_bh(&ip_fb_id_lock);
1357 }
1358 
1359 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1360 {
1361 	struct rtable *rt = (struct rtable *) dst;
1362 
1363 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1364 		if (rt->peer == NULL)
1365 			rt_bind_peer(rt, rt->rt_dst, 1);
1366 
1367 		/* If peer is attached to destination, it is never detached,
1368 		   so we do not need to grab a lock to dereference it.
1369 		 */
1370 		if (rt->peer) {
1371 			iph->id = htons(inet_getid(rt->peer, more));
1372 			return;
1373 		}
1374 	} else if (!rt)
1375 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1376 		       __builtin_return_address(0));
1377 
1378 	ip_select_fb_ident(iph);
1379 }
1380 EXPORT_SYMBOL(__ip_select_ident);
1381 
1382 static void rt_del(unsigned hash, struct rtable *rt)
1383 {
1384 	struct rtable __rcu **rthp;
1385 	struct rtable *aux;
1386 
1387 	rthp = &rt_hash_table[hash].chain;
1388 	spin_lock_bh(rt_hash_lock_addr(hash));
1389 	ip_rt_put(rt);
1390 	while ((aux = rcu_dereference_protected(*rthp,
1391 			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1392 		if (aux == rt || rt_is_expired(aux)) {
1393 			*rthp = aux->dst.rt_next;
1394 			rt_free(aux);
1395 			continue;
1396 		}
1397 		rthp = &aux->dst.rt_next;
1398 	}
1399 	spin_unlock_bh(rt_hash_lock_addr(hash));
1400 }
1401 
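/*
 * Switch the route to the gateway learned from an ICMP redirect and rebind
 * its neighbour entry; flag the route RTCF_REDIRECTED once the neighbour
 * is valid.
 */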
1402 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1403 {
1404 	struct rtable *rt = (struct rtable *) dst;
1405 	__be32 orig_gw = rt->rt_gateway;
1406 	struct neighbour *n, *old_n;
1407 
1408 	dst_confirm(&rt->dst);
1409 
1410 	rt->rt_gateway = peer->redirect_learned.a4;
1411 
1412 	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1413 	if (IS_ERR(n)) {
1414 		rt->rt_gateway = orig_gw;
1415 		return;
1416 	}
1417 	old_n = xchg(&rt->dst._neighbour, n);
1418 	if (old_n)
1419 		neigh_release(old_n);
1420 	if (!(n->nud_state & NUD_VALID)) {
1421 		neigh_event_send(n, NULL);
1422 	} else {
1423 		rt->rt_flags |= RTCF_REDIRECTED;
1424 		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1425 	}
1426 }
1427 
1428 /* called in rcu_read_lock() section */
1429 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1430 		    __be32 saddr, struct net_device *dev)
1431 {
1432 	int s, i;
1433 	struct in_device *in_dev = __in_dev_get_rcu(dev);
1434 	__be32 skeys[2] = { saddr, 0 };
1435 	int    ikeys[2] = { dev->ifindex, 0 };
1436 	struct inet_peer *peer;
1437 	struct net *net;
1438 
1439 	if (!in_dev)
1440 		return;
1441 
1442 	net = dev_net(dev);
1443 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1444 	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1445 	    ipv4_is_zeronet(new_gw))
1446 		goto reject_redirect;
1447 
1448 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1449 		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1450 			goto reject_redirect;
1451 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1452 			goto reject_redirect;
1453 	} else {
1454 		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1455 			goto reject_redirect;
1456 	}
1457 
1458 	for (s = 0; s < 2; s++) {
1459 		for (i = 0; i < 2; i++) {
1460 			unsigned int hash;
1461 			struct rtable __rcu **rthp;
1462 			struct rtable *rt;
1463 
1464 			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1465 
1466 			rthp = &rt_hash_table[hash].chain;
1467 
1468 			while ((rt = rcu_dereference(*rthp)) != NULL) {
1469 				rthp = &rt->dst.rt_next;
1470 
1471 				if (rt->rt_key_dst != daddr ||
1472 				    rt->rt_key_src != skeys[s] ||
1473 				    rt->rt_oif != ikeys[i] ||
1474 				    rt_is_input_route(rt) ||
1475 				    rt_is_expired(rt) ||
1476 				    !net_eq(dev_net(rt->dst.dev), net) ||
1477 				    rt->dst.error ||
1478 				    rt->dst.dev != dev ||
1479 				    rt->rt_gateway != old_gw)
1480 					continue;
1481 
1482 				if (!rt->peer)
1483 					rt_bind_peer(rt, rt->rt_dst, 1);
1484 
1485 				peer = rt->peer;
1486 				if (peer) {
1487 					if (peer->redirect_learned.a4 != new_gw) {
1488 						peer->redirect_learned.a4 = new_gw;
1489 						atomic_inc(&__rt_peer_genid);
1490 					}
1491 					check_peer_redir(&rt->dst, peer);
1492 				}
1493 			}
1494 		}
1495 	}
1496 	return;
1497 
1498 reject_redirect:
1499 #ifdef CONFIG_IP_ROUTE_VERBOSE
1500 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1501 		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1502 			"  Advised path = %pI4 -> %pI4\n",
1503 		       &old_gw, dev->name, &new_gw,
1504 		       &saddr, &daddr);
1505 #endif
1506 	;
1507 }
1508 
1509 static bool peer_pmtu_expired(struct inet_peer *peer)
1510 {
1511 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512 
1513 	return orig &&
1514 	       time_after_eq(jiffies, orig) &&
1515 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516 }
1517 
1518 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519 {
1520 	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521 
1522 	return orig &&
1523 	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524 }
1525 
1526 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1527 {
1528 	struct rtable *rt = (struct rtable *)dst;
1529 	struct dst_entry *ret = dst;
1530 
1531 	if (rt) {
1532 		if (dst->obsolete > 0) {
1533 			ip_rt_put(rt);
1534 			ret = NULL;
1535 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1536 			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1537 						rt->rt_oif,
1538 						rt_genid(dev_net(dst->dev)));
1539 			rt_del(hash, rt);
1540 			ret = NULL;
1541 		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1542 			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1543 		}
1544 	}
1545 	return ret;
1546 }
1547 
1548 /*
1549  * Algorithm:
1550  *	1. The first ip_rt_redirect_number redirects are sent
1551  *	   with exponential backoff, then we stop sending them at all,
1552  *	   assuming that the host ignores our redirects.
1553  *	2. If we did not see packets requiring redirects
1554  *	   during ip_rt_redirect_silence, we assume that the host
1555  *	   forgot the redirected route and start sending redirects again.
1556  *
1557  * This algorithm is much cheaper and more intelligent than dumb load limiting
1558  * in icmp.c.
1559  *
1560  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1561  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1562  */
1563 
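/*
 * With the defaults above this means at most ip_rt_redirect_number (9)
 * redirects, spaced ip_rt_redirect_load << rate_tokens apart (20ms, 40ms,
 * 80ms, ...), and a silence window of ip_rt_redirect_silence (roughly 20
 * seconds) before the token counter is reset.
 */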
1564 void ip_rt_send_redirect(struct sk_buff *skb)
1565 {
1566 	struct rtable *rt = skb_rtable(skb);
1567 	struct in_device *in_dev;
1568 	struct inet_peer *peer;
1569 	int log_martians;
1570 
1571 	rcu_read_lock();
1572 	in_dev = __in_dev_get_rcu(rt->dst.dev);
1573 	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1574 		rcu_read_unlock();
1575 		return;
1576 	}
1577 	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1578 	rcu_read_unlock();
1579 
1580 	if (!rt->peer)
1581 		rt_bind_peer(rt, rt->rt_dst, 1);
1582 	peer = rt->peer;
1583 	if (!peer) {
1584 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1585 		return;
1586 	}
1587 
1588 	/* No redirected packets during ip_rt_redirect_silence;
1589 	 * reset the algorithm.
1590 	 */
1591 	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1592 		peer->rate_tokens = 0;
1593 
1594 	/* Too many ignored redirects; do not send anything.
1595 	 * Set peer->rate_last to the time of the last seen redirected packet.
1596 	 */
1597 	if (peer->rate_tokens >= ip_rt_redirect_number) {
1598 		peer->rate_last = jiffies;
1599 		return;
1600 	}
1601 
1602 	/* Check for load limit; set rate_last to the latest sent
1603 	 * redirect.
1604 	 */
1605 	if (peer->rate_tokens == 0 ||
1606 	    time_after(jiffies,
1607 		       (peer->rate_last +
1608 			(ip_rt_redirect_load << peer->rate_tokens)))) {
1609 		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1610 		peer->rate_last = jiffies;
1611 		++peer->rate_tokens;
1612 #ifdef CONFIG_IP_ROUTE_VERBOSE
1613 		if (log_martians &&
1614 		    peer->rate_tokens == ip_rt_redirect_number &&
1615 		    net_ratelimit())
1616 			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1617 			       &ip_hdr(skb)->saddr, rt->rt_iif,
1618 				&rt->rt_dst, &rt->rt_gateway);
1619 #endif
1620 	}
1621 }
1622 
1623 static int ip_error(struct sk_buff *skb)
1624 {
1625 	struct rtable *rt = skb_rtable(skb);
1626 	struct inet_peer *peer;
1627 	unsigned long now;
1628 	bool send;
1629 	int code;
1630 
1631 	switch (rt->dst.error) {
1632 	case EINVAL:
1633 	default:
1634 		goto out;
1635 	case EHOSTUNREACH:
1636 		code = ICMP_HOST_UNREACH;
1637 		break;
1638 	case ENETUNREACH:
1639 		code = ICMP_NET_UNREACH;
1640 		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641 				IPSTATS_MIB_INNOROUTES);
1642 		break;
1643 	case EACCES:
1644 		code = ICMP_PKT_FILTERED;
1645 		break;
1646 	}
1647 
1648 	if (!rt->peer)
1649 		rt_bind_peer(rt, rt->rt_dst, 1);
1650 	peer = rt->peer;
1651 
1652 	send = true;
1653 	if (peer) {
1654 		now = jiffies;
1655 		peer->rate_tokens += now - peer->rate_last;
1656 		if (peer->rate_tokens > ip_rt_error_burst)
1657 			peer->rate_tokens = ip_rt_error_burst;
1658 		peer->rate_last = now;
1659 		if (peer->rate_tokens >= ip_rt_error_cost)
1660 			peer->rate_tokens -= ip_rt_error_cost;
1661 		else
1662 			send = false;
1663 	}
1664 	if (send)
1665 		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666 
1667 out:	kfree_skb(skb);
1668 	return 0;
1669 }
1670 
1671 /*
1672  *	The last two values are not from the RFC but
1673  *	are needed for AMPRnet AX.25 paths.
1674  */
1675 
1676 static const unsigned short mtu_plateau[] =
1677 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1678 
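/*
 * Return the largest plateau value below old_mtu (e.g. 1500 -> 1492);
 * fall back to 68, the minimum IPv4 MTU.
 */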
1679 static inline unsigned short guess_mtu(unsigned short old_mtu)
1680 {
1681 	int i;
1682 
1683 	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1684 		if (old_mtu > mtu_plateau[i])
1685 			return mtu_plateau[i];
1686 	return 68;
1687 }
1688 
1689 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690 				 unsigned short new_mtu,
1691 				 struct net_device *dev)
1692 {
1693 	unsigned short old_mtu = ntohs(iph->tot_len);
1694 	unsigned short est_mtu = 0;
1695 	struct inet_peer *peer;
1696 
1697 	peer = inet_getpeer_v4(iph->daddr, 1);
1698 	if (peer) {
1699 		unsigned short mtu = new_mtu;
1700 
1701 		if (new_mtu < 68 || new_mtu >= old_mtu) {
1702 			/* BSD 4.2 derived systems incorrectly adjust
1703 			 * tot_len by the IP header length, and report
1704 			 * a zero MTU in the ICMP message.
1705 			 */
1706 			if (mtu == 0 &&
1707 			    old_mtu >= 68 + (iph->ihl << 2))
1708 				old_mtu -= iph->ihl << 2;
1709 			mtu = guess_mtu(old_mtu);
1710 		}
1711 
1712 		if (mtu < ip_rt_min_pmtu)
1713 			mtu = ip_rt_min_pmtu;
1714 		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715 			unsigned long pmtu_expires;
1716 
1717 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1718 			if (!pmtu_expires)
1719 				pmtu_expires = 1UL;
1720 
1721 			est_mtu = mtu;
1722 			peer->pmtu_learned = mtu;
1723 			peer->pmtu_expires = pmtu_expires;
1724 			atomic_inc(&__rt_peer_genid);
1725 		}
1726 
1727 		inet_putpeer(peer);
1728 	}
1729 	return est_mtu ? : new_mtu;
1730 }
1731 
1732 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733 {
1734 	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735 
1736 	if (!expires)
1737 		return;
1738 	if (time_before(jiffies, expires)) {
1739 		u32 orig_dst_mtu = dst_mtu(dst);
1740 		if (peer->pmtu_learned < orig_dst_mtu) {
1741 			if (!peer->pmtu_orig)
1742 				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743 			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744 		}
1745 	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746 		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747 }
1748 
1749 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750 {
1751 	struct rtable *rt = (struct rtable *) dst;
1752 	struct inet_peer *peer;
1753 
1754 	dst_confirm(dst);
1755 
1756 	if (!rt->peer)
1757 		rt_bind_peer(rt, rt->rt_dst, 1);
1758 	peer = rt->peer;
1759 	if (peer) {
1760 		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761 
1762 		if (mtu < ip_rt_min_pmtu)
1763 			mtu = ip_rt_min_pmtu;
1764 		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765 
1766 			pmtu_expires = jiffies + ip_rt_mtu_expires;
1767 			if (!pmtu_expires)
1768 				pmtu_expires = 1UL;
1769 
1770 			peer->pmtu_learned = mtu;
1771 			peer->pmtu_expires = pmtu_expires;
1772 
1773 			atomic_inc(&__rt_peer_genid);
1774 			rt->rt_peer_genid = rt_peer_genid();
1775 		}
1776 		check_peer_pmtu(dst, peer);
1777 	}
1778 }
1779 
1780 
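/*
 * If the global peer generation has changed, re-apply any PMTU or gateway
 * redirect information recorded on this route's inet_peer.
 */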
1781 static void ipv4_validate_peer(struct rtable *rt)
1782 {
1783 	if (rt->rt_peer_genid != rt_peer_genid()) {
1784 		struct inet_peer *peer;
1785 
1786 		if (!rt->peer)
1787 			rt_bind_peer(rt, rt->rt_dst, 0);
1788 
1789 		peer = rt->peer;
1790 		if (peer) {
1791 			check_peer_pmtu(&rt->dst, peer);
1792 
1793 			if (peer->redirect_learned.a4 &&
1794 			    peer->redirect_learned.a4 != rt->rt_gateway)
1795 				check_peer_redir(&rt->dst, peer);
1796 		}
1797 
1798 		rt->rt_peer_genid = rt_peer_genid();
1799 	}
1800 }
1801 
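/* dst_ops->check handler: a cached route is only usable while its generation
 * id is current; peer-derived state is re-validated on every check.
 */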
1802 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803 {
1804 	struct rtable *rt = (struct rtable *) dst;
1805 
1806 	if (rt_is_expired(rt))
1807 		return NULL;
1808 	ipv4_validate_peer(rt);
1809 	return dst;
1810 }
1811 
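/* dst_ops->destroy handler: release the references this route holds on its
 * fib_info and inet_peer.
 */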
1812 static void ipv4_dst_destroy(struct dst_entry *dst)
1813 {
1814 	struct rtable *rt = (struct rtable *) dst;
1815 	struct inet_peer *peer = rt->peer;
1816 
1817 	if (rt->fi) {
1818 		fib_info_put(rt->fi);
1819 		rt->fi = NULL;
1820 	}
1821 	if (peer) {
1822 		rt->peer = NULL;
1823 		inet_putpeer(peer);
1824 	}
1825 }
1826 
1827 
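/* Link failure handler: tell the sender the host is unreachable and undo any
 * PMTU we learned for this peer.
 */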
1828 static void ipv4_link_failure(struct sk_buff *skb)
1829 {
1830 	struct rtable *rt;
1831 
1832 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833 
1834 	rt = skb_rtable(skb);
1835 	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836 		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837 }
1838 
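/* Output handler for routes that must never transmit; reaching it is a bug. */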
1839 static int ip_rt_bug(struct sk_buff *skb)
1840 {
1841 	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1842 		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843 		skb->dev ? skb->dev->name : "?");
1844 	kfree_skb(skb);
1845 	WARN_ON(1);
1846 	return 0;
1847 }
1848 
1849 /*
1850    We do not cache the source address of the outgoing interface,
1851    because it is used only by the IP RR, TS and SRR options,
1852    so it is out of the fast path.
1853 
1854    BTW remember: "addr" is allowed to be unaligned
1855    in IP options!
1856  */
1857 
1858 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859 {
1860 	__be32 src;
1861 
1862 	if (rt_is_output_route(rt))
1863 		src = ip_hdr(skb)->saddr;
1864 	else {
1865 		struct fib_result res;
1866 		struct flowi4 fl4;
1867 		struct iphdr *iph;
1868 
1869 		iph = ip_hdr(skb);
1870 
1871 		memset(&fl4, 0, sizeof(fl4));
1872 		fl4.daddr = iph->daddr;
1873 		fl4.saddr = iph->saddr;
1874 		fl4.flowi4_tos = RT_TOS(iph->tos);
1875 		fl4.flowi4_oif = rt->dst.dev->ifindex;
1876 		fl4.flowi4_iif = skb->dev->ifindex;
1877 		fl4.flowi4_mark = skb->mark;
1878 
1879 		rcu_read_lock();
1880 		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881 			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882 		else
1883 			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884 					RT_SCOPE_UNIVERSE);
1885 		rcu_read_unlock();
1886 	}
1887 	memcpy(addr, &src, 4);
1888 }
1889 
1890 #ifdef CONFIG_IP_ROUTE_CLASSID
1891 static void set_class_tag(struct rtable *rt, u32 tag)
1892 {
1893 	if (!(rt->dst.tclassid & 0xFFFF))
1894 		rt->dst.tclassid |= tag & 0xFFFF;
1895 	if (!(rt->dst.tclassid & 0xFFFF0000))
1896 		rt->dst.tclassid |= tag & 0xFFFF0000;
1897 }
1898 #endif
1899 
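/* dst_ops->default_advmss handler: derive the advertised MSS from the device
 * MTU minus 40 bytes of IP+TCP header, bounded below by ip_rt_min_advmss.
 */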
1900 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901 {
1902 	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903 
1904 	if (advmss == 0) {
1905 		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906 			       ip_rt_min_advmss);
1907 		if (advmss > 65535 - 40)
1908 			advmss = 65535 - 40;
1909 	}
1910 	return advmss;
1911 }
1912 
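/* dst_ops->mtu handler: use an explicit MTU metric for output routes,
 * otherwise fall back to the device MTU, honouring a locked 576-byte MTU for
 * gatewayed destinations and the IP_MAX_MTU ceiling.
 */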
1913 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914 {
1915 	const struct rtable *rt = (const struct rtable *) dst;
1916 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917 
1918 	if (mtu && rt_is_output_route(rt))
1919 		return mtu;
1920 
1921 	mtu = dst->dev->mtu;
1922 
1923 	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924 
1925 		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 			mtu = 576;
1927 	}
1928 
1929 	if (mtu > IP_MAX_MTU)
1930 		mtu = IP_MAX_MTU;
1931 
1932 	return mtu;
1933 }
1934 
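/* Attach metrics to a freshly built route: prefer (and possibly pre-create)
 * the destination peer's metrics, otherwise fall back to the fib_info ones.
 */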
1935 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936 			    struct fib_info *fi)
1937 {
1938 	struct inet_peer *peer;
1939 	int create = 0;
1940 
1941 	/* If a peer entry exists for this destination, we must hook
1942 	 * it up in order to get at cached metrics.
1943 	 */
1944 	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945 		create = 1;
1946 
1947 	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948 	if (peer) {
1949 		rt->rt_peer_genid = rt_peer_genid();
1950 		if (inet_metrics_new(peer))
1951 			memcpy(peer->metrics, fi->fib_metrics,
1952 			       sizeof(u32) * RTAX_MAX);
1953 		dst_init_metrics(&rt->dst, peer->metrics, false);
1954 
1955 		check_peer_pmtu(&rt->dst, peer);
1956 
1957 		if (peer->redirect_learned.a4 &&
1958 		    peer->redirect_learned.a4 != rt->rt_gateway) {
1959 			rt->rt_gateway = peer->redirect_learned.a4;
1960 			rt->rt_flags |= RTCF_REDIRECTED;
1961 		}
1962 	} else {
1963 		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964 			rt->fi = fi;
1965 			atomic_inc(&fi->fib_clntref);
1966 		}
1967 		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968 	}
1969 }
1970 
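/* Fill in nexthop-derived fields of a new route: gateway, metrics, class tag
 * and sanity-clamped MTU/advmss values.
 */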
1971 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972 			   const struct fib_result *res,
1973 			   struct fib_info *fi, u16 type, u32 itag)
1974 {
1975 	struct dst_entry *dst = &rt->dst;
1976 
1977 	if (fi) {
1978 		if (FIB_RES_GW(*res) &&
1979 		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980 			rt->rt_gateway = FIB_RES_GW(*res);
1981 		rt_init_metrics(rt, fl4, fi);
1982 #ifdef CONFIG_IP_ROUTE_CLASSID
1983 		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984 #endif
1985 	}
1986 
1987 	if (dst_mtu(dst) > IP_MAX_MTU)
1988 		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989 	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990 		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991 
1992 #ifdef CONFIG_IP_ROUTE_CLASSID
1993 #ifdef CONFIG_IP_MULTIPLE_TABLES
1994 	set_class_tag(rt, fib_rules_tclass(res));
1995 #endif
1996 	set_class_tag(rt, itag);
1997 #endif
1998 }
1999 
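/* Allocate a dst for an IPv4 route with DST_HOST plus the per-device
 * policy/xfrm exemption flags.
 */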
2000 static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 				   bool nopolicy, bool noxfrm)
2002 {
2003 	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004 			 DST_HOST |
2005 			 (nopolicy ? DST_NOPOLICY : 0) |
2006 			 (noxfrm ? DST_NOXFRM : 0));
2007 }
2008 
2009 /* called in rcu_read_lock() section */
2010 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011 				u8 tos, struct net_device *dev, int our)
2012 {
2013 	unsigned int hash;
2014 	struct rtable *rth;
2015 	__be32 spec_dst;
2016 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2017 	u32 itag = 0;
2018 	int err;
2019 
2020 	/* Primary sanity checks. */
2021 
2022 	if (in_dev == NULL)
2023 		return -EINVAL;
2024 
2025 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026 	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027 		goto e_inval;
2028 
2029 	if (ipv4_is_zeronet(saddr)) {
2030 		if (!ipv4_is_local_multicast(daddr))
2031 			goto e_inval;
2032 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033 	} else {
2034 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035 					  &itag);
2036 		if (err < 0)
2037 			goto e_err;
2038 	}
2039 	rth = rt_dst_alloc(init_net.loopback_dev,
2040 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041 	if (!rth)
2042 		goto e_nobufs;
2043 
2044 #ifdef CONFIG_IP_ROUTE_CLASSID
2045 	rth->dst.tclassid = itag;
2046 #endif
2047 	rth->dst.output = ip_rt_bug;
2048 
2049 	rth->rt_key_dst	= daddr;
2050 	rth->rt_key_src	= saddr;
2051 	rth->rt_genid	= rt_genid(dev_net(dev));
2052 	rth->rt_flags	= RTCF_MULTICAST;
2053 	rth->rt_type	= RTN_MULTICAST;
2054 	rth->rt_key_tos	= tos;
2055 	rth->rt_dst	= daddr;
2056 	rth->rt_src	= saddr;
2057 	rth->rt_route_iif = dev->ifindex;
2058 	rth->rt_iif	= dev->ifindex;
2059 	rth->rt_oif	= 0;
2060 	rth->rt_mark    = skb->mark;
2061 	rth->rt_gateway	= daddr;
2062 	rth->rt_spec_dst= spec_dst;
2063 	rth->rt_peer_genid = 0;
2064 	rth->peer = NULL;
2065 	rth->fi = NULL;
2066 	if (our) {
2067 		rth->dst.input= ip_local_deliver;
2068 		rth->rt_flags |= RTCF_LOCAL;
2069 	}
2070 
2071 #ifdef CONFIG_IP_MROUTE
2072 	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073 		rth->dst.input = ip_mr_input;
2074 #endif
2075 	RT_CACHE_STAT_INC(in_slow_mc);
2076 
2077 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078 	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079 	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080 
2081 e_nobufs:
2082 	return -ENOBUFS;
2083 e_inval:
2084 	return -EINVAL;
2085 e_err:
2086 	return err;
2087 }
2088 
2089 
2090 static void ip_handle_martian_source(struct net_device *dev,
2091 				     struct in_device *in_dev,
2092 				     struct sk_buff *skb,
2093 				     __be32 daddr,
2094 				     __be32 saddr)
2095 {
2096 	RT_CACHE_STAT_INC(in_martian_src);
2097 #ifdef CONFIG_IP_ROUTE_VERBOSE
2098 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099 		/*
2100 		 *	RFC1812 recommendation: if the source is martian,
2101 		 *	the only hint is the MAC header.
2102 		 */
2103 		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2104 			&daddr, &saddr, dev->name);
2105 		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106 			int i;
2107 			const unsigned char *p = skb_mac_header(skb);
2108 			printk(KERN_WARNING "ll header: ");
2109 			for (i = 0; i < dev->hard_header_len; i++, p++) {
2110 				printk("%02x", *p);
2111 				if (i < (dev->hard_header_len - 1))
2112 					printk(":");
2113 			}
2114 			printk("\n");
2115 		}
2116 	}
2117 #endif
2118 }
2119 
2120 /* called in rcu_read_lock() section */
2121 static int __mkroute_input(struct sk_buff *skb,
2122 			   const struct fib_result *res,
2123 			   struct in_device *in_dev,
2124 			   __be32 daddr, __be32 saddr, u32 tos,
2125 			   struct rtable **result)
2126 {
2127 	struct rtable *rth;
2128 	int err;
2129 	struct in_device *out_dev;
2130 	unsigned int flags = 0;
2131 	__be32 spec_dst;
2132 	u32 itag;
2133 
2134 	/* get a working reference to the output device */
2135 	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2136 	if (out_dev == NULL) {
2137 		if (net_ratelimit())
2138 			printk(KERN_CRIT "Bug in ip_route_input" \
2139 			       "_slow(). Please, report\n");
2140 		return -EINVAL;
2141 	}
2142 
2143 
2144 	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145 				  in_dev->dev, &spec_dst, &itag);
2146 	if (err < 0) {
2147 		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2148 					 saddr);
2149 
2150 		goto cleanup;
2151 	}
2152 
2153 	if (err)
2154 		flags |= RTCF_DIRECTSRC;
2155 
2156 	if (out_dev == in_dev && err &&
2157 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2158 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159 		flags |= RTCF_DOREDIRECT;
2160 
2161 	if (skb->protocol != htons(ETH_P_IP)) {
2162 		/* Not IP (i.e. ARP). Do not create a route if it is
2163 		 * invalid for proxy arp. DNAT routes are always valid.
2164 		 *
2165 		 * The proxy arp feature has been extended to allow ARP
2166 		 * replies back to the same interface, to support
2167 		 * Private VLAN switch technologies. See arp.c.
2168 		 */
2169 		if (out_dev == in_dev &&
2170 		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2171 			err = -EINVAL;
2172 			goto cleanup;
2173 		}
2174 	}
2175 
2176 	rth = rt_dst_alloc(out_dev->dev,
2177 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2178 			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2179 	if (!rth) {
2180 		err = -ENOBUFS;
2181 		goto cleanup;
2182 	}
2183 
2184 	rth->rt_key_dst	= daddr;
2185 	rth->rt_key_src	= saddr;
2186 	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187 	rth->rt_flags = flags;
2188 	rth->rt_type = res->type;
2189 	rth->rt_key_tos	= tos;
2190 	rth->rt_dst	= daddr;
2191 	rth->rt_src	= saddr;
2192 	rth->rt_route_iif = in_dev->dev->ifindex;
2193 	rth->rt_iif 	= in_dev->dev->ifindex;
2194 	rth->rt_oif 	= 0;
2195 	rth->rt_mark    = skb->mark;
2196 	rth->rt_gateway	= daddr;
2197 	rth->rt_spec_dst= spec_dst;
2198 	rth->rt_peer_genid = 0;
2199 	rth->peer = NULL;
2200 	rth->fi = NULL;
2201 
2202 	rth->dst.input = ip_forward;
2203 	rth->dst.output = ip_output;
2204 
2205 	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2206 
2207 	*result = rth;
2208 	err = 0;
2209  cleanup:
2210 	return err;
2211 }
2212 
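/* Build a forwarding cache entry for an input route (selecting a multipath
 * nexthop first when configured) and insert it into the hash table.
 */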
2213 static int ip_mkroute_input(struct sk_buff *skb,
2214 			    struct fib_result *res,
2215 			    const struct flowi4 *fl4,
2216 			    struct in_device *in_dev,
2217 			    __be32 daddr, __be32 saddr, u32 tos)
2218 {
2219 	struct rtable* rth = NULL;
2220 	int err;
2221 	unsigned hash;
2222 
2223 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2224 	if (res->fi && res->fi->fib_nhs > 1)
2225 		fib_select_multipath(res);
2226 #endif
2227 
2228 	/* create a routing cache entry */
2229 	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 	if (err)
2231 		return err;
2232 
2233 	/* put it into the cache */
2234 	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2235 		       rt_genid(dev_net(rth->dst.dev)));
2236 	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2237 	if (IS_ERR(rth))
2238 		return PTR_ERR(rth);
2239 	return 0;
2240 }
2241 
2242 /*
2243  *	NOTE. We drop all packets that have a local source
2244  *	address, because every properly looped-back packet
2245  *	must already have the correct destination attached by the output routine.
2246  *
2247  *	This approach solves two big problems:
2248  *	1. Non-simplex devices are handled properly.
2249  *	2. IP spoofing attempts are filtered with a 100% guarantee.
2250  *	called with rcu_read_lock()
2251  */
2252 
2253 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2254 			       u8 tos, struct net_device *dev)
2255 {
2256 	struct fib_result res;
2257 	struct in_device *in_dev = __in_dev_get_rcu(dev);
2258 	struct flowi4	fl4;
2259 	unsigned	flags = 0;
2260 	u32		itag = 0;
2261 	struct rtable * rth;
2262 	unsigned	hash;
2263 	__be32		spec_dst;
2264 	int		err = -EINVAL;
2265 	struct net    * net = dev_net(dev);
2266 
2267 	/* IP on this device is disabled. */
2268 
2269 	if (!in_dev)
2270 		goto out;
2271 
2272 	/* Check for the most weird martians, which cannot be detected
2273 	   by fib_lookup.
2274 	 */
2275 
2276 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2277 	    ipv4_is_loopback(saddr))
2278 		goto martian_source;
2279 
2280 	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2281 		goto brd_input;
2282 
2283 	/* Accept zero addresses only to limited broadcast;
2284 	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2285 	 */
2286 	if (ipv4_is_zeronet(saddr))
2287 		goto martian_source;
2288 
2289 	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2290 		goto martian_destination;
2291 
2292 	/*
2293 	 *	Now we are ready to route packet.
2294 	 */
2295 	fl4.flowi4_oif = 0;
2296 	fl4.flowi4_iif = dev->ifindex;
2297 	fl4.flowi4_mark = skb->mark;
2298 	fl4.flowi4_tos = tos;
2299 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 	fl4.daddr = daddr;
2301 	fl4.saddr = saddr;
2302 	err = fib_lookup(net, &fl4, &res);
2303 	if (err != 0) {
2304 		if (!IN_DEV_FORWARD(in_dev))
2305 			goto e_hostunreach;
2306 		goto no_route;
2307 	}
2308 
2309 	RT_CACHE_STAT_INC(in_slow_tot);
2310 
2311 	if (res.type == RTN_BROADCAST)
2312 		goto brd_input;
2313 
2314 	if (res.type == RTN_LOCAL) {
2315 		err = fib_validate_source(skb, saddr, daddr, tos,
2316 					  net->loopback_dev->ifindex,
2317 					  dev, &spec_dst, &itag);
2318 		if (err < 0)
2319 			goto martian_source_keep_err;
2320 		if (err)
2321 			flags |= RTCF_DIRECTSRC;
2322 		spec_dst = daddr;
2323 		goto local_input;
2324 	}
2325 
2326 	if (!IN_DEV_FORWARD(in_dev))
2327 		goto e_hostunreach;
2328 	if (res.type != RTN_UNICAST)
2329 		goto martian_destination;
2330 
2331 	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2332 out:	return err;
2333 
2334 brd_input:
2335 	if (skb->protocol != htons(ETH_P_IP))
2336 		goto e_inval;
2337 
2338 	if (ipv4_is_zeronet(saddr))
2339 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 	else {
2341 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 					  &itag);
2343 		if (err < 0)
2344 			goto martian_source_keep_err;
2345 		if (err)
2346 			flags |= RTCF_DIRECTSRC;
2347 	}
2348 	flags |= RTCF_BROADCAST;
2349 	res.type = RTN_BROADCAST;
2350 	RT_CACHE_STAT_INC(in_brd);
2351 
2352 local_input:
2353 	rth = rt_dst_alloc(net->loopback_dev,
2354 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2355 	if (!rth)
2356 		goto e_nobufs;
2357 
2358 	rth->dst.input= ip_local_deliver;
2359 	rth->dst.output= ip_rt_bug;
2360 #ifdef CONFIG_IP_ROUTE_CLASSID
2361 	rth->dst.tclassid = itag;
2362 #endif
2363 
2364 	rth->rt_key_dst	= daddr;
2365 	rth->rt_key_src	= saddr;
2366 	rth->rt_genid = rt_genid(net);
2367 	rth->rt_flags 	= flags|RTCF_LOCAL;
2368 	rth->rt_type	= res.type;
2369 	rth->rt_key_tos	= tos;
2370 	rth->rt_dst	= daddr;
2371 	rth->rt_src	= saddr;
2372 #ifdef CONFIG_IP_ROUTE_CLASSID
2373 	rth->dst.tclassid = itag;
2374 #endif
2375 	rth->rt_route_iif = dev->ifindex;
2376 	rth->rt_iif	= dev->ifindex;
2377 	rth->rt_oif	= 0;
2378 	rth->rt_mark    = skb->mark;
2379 	rth->rt_gateway	= daddr;
2380 	rth->rt_spec_dst= spec_dst;
2381 	rth->rt_peer_genid = 0;
2382 	rth->peer = NULL;
2383 	rth->fi = NULL;
2384 	if (res.type == RTN_UNREACHABLE) {
2385 		rth->dst.input= ip_error;
2386 		rth->dst.error= -err;
2387 		rth->rt_flags 	&= ~RTCF_LOCAL;
2388 	}
2389 	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2391 	err = 0;
2392 	if (IS_ERR(rth))
2393 		err = PTR_ERR(rth);
2394 	goto out;
2395 
2396 no_route:
2397 	RT_CACHE_STAT_INC(in_no_route);
2398 	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 	res.type = RTN_UNREACHABLE;
2400 	if (err == -ESRCH)
2401 		err = -ENETUNREACH;
2402 	goto local_input;
2403 
2404 	/*
2405 	 *	Do not cache martian addresses: they should be logged (RFC1812)
2406 	 */
2407 martian_destination:
2408 	RT_CACHE_STAT_INC(in_martian_dst);
2409 #ifdef CONFIG_IP_ROUTE_VERBOSE
2410 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2411 		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2412 			&daddr, &saddr, dev->name);
2413 #endif
2414 
2415 e_hostunreach:
2416 	err = -EHOSTUNREACH;
2417 	goto out;
2418 
2419 e_inval:
2420 	err = -EINVAL;
2421 	goto out;
2422 
2423 e_nobufs:
2424 	err = -ENOBUFS;
2425 	goto out;
2426 
2427 martian_source:
2428 	err = -EINVAL;
2429 martian_source_keep_err:
2430 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2431 	goto out;
2432 }
2433 
2434 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 			   u8 tos, struct net_device *dev, bool noref)
2436 {
2437 	struct rtable * rth;
2438 	unsigned	hash;
2439 	int iif = dev->ifindex;
2440 	struct net *net;
2441 	int res;
2442 
2443 	net = dev_net(dev);
2444 
2445 	rcu_read_lock();
2446 
2447 	if (!rt_caching(net))
2448 		goto skip_cache;
2449 
2450 	tos &= IPTOS_RT_MASK;
2451 	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2452 
2453 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454 	     rth = rcu_dereference(rth->dst.rt_next)) {
2455 		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2457 		     (rth->rt_route_iif ^ iif) |
2458 		     (rth->rt_key_tos ^ tos)) == 0 &&
2459 		    rth->rt_mark == skb->mark &&
2460 		    net_eq(dev_net(rth->dst.dev), net) &&
2461 		    !rt_is_expired(rth)) {
2462 			ipv4_validate_peer(rth);
2463 			if (noref) {
2464 				dst_use_noref(&rth->dst, jiffies);
2465 				skb_dst_set_noref(skb, &rth->dst);
2466 			} else {
2467 				dst_use(&rth->dst, jiffies);
2468 				skb_dst_set(skb, &rth->dst);
2469 			}
2470 			RT_CACHE_STAT_INC(in_hit);
2471 			rcu_read_unlock();
2472 			return 0;
2473 		}
2474 		RT_CACHE_STAT_INC(in_hlist_search);
2475 	}
2476 
2477 skip_cache:
2478 	/* Multicast recognition logic has been moved from the route cache to here.
2479 	   The problem was that too many Ethernet cards have broken/missing
2480 	   hardware multicast filters :-( As a result, a host on a multicast
2481 	   network acquires a lot of useless route cache entries, e.g. from
2482 	   SDR messages from all over the world. Now we try to get rid of them.
2483 	   Really, provided the software IP multicast filter is organized
2484 	   reasonably (at least, hashed), it does not result in a slowdown
2485 	   compared with route cache reject entries.
2486 	   Note that multicast routers are not affected, because
2487 	   a route cache entry is created eventually.
2488 	 */
2489 	if (ipv4_is_multicast(daddr)) {
2490 		struct in_device *in_dev = __in_dev_get_rcu(dev);
2491 
2492 		if (in_dev) {
2493 			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 						  ip_hdr(skb)->protocol);
2495 			if (our
2496 #ifdef CONFIG_IP_MROUTE
2497 				||
2498 			    (!ipv4_is_local_multicast(daddr) &&
2499 			     IN_DEV_MFORWARD(in_dev))
2500 #endif
2501 			   ) {
2502 				int res = ip_route_input_mc(skb, daddr, saddr,
2503 							    tos, dev, our);
2504 				rcu_read_unlock();
2505 				return res;
2506 			}
2507 		}
2508 		rcu_read_unlock();
2509 		return -EINVAL;
2510 	}
2511 	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 	rcu_read_unlock();
2513 	return res;
2514 }
2515 EXPORT_SYMBOL(ip_route_input_common);
2516 
2517 /* called with rcu_read_lock() */
2518 static struct rtable *__mkroute_output(const struct fib_result *res,
2519 				       const struct flowi4 *fl4,
2520 				       __be32 orig_daddr, __be32 orig_saddr,
2521 				       int orig_oif, __u8 orig_rtos,
2522 				       struct net_device *dev_out,
2523 				       unsigned int flags)
2524 {
2525 	struct fib_info *fi = res->fi;
2526 	struct in_device *in_dev;
2527 	u16 type = res->type;
2528 	struct rtable *rth;
2529 
2530 	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2531 		return ERR_PTR(-EINVAL);
2532 
2533 	if (ipv4_is_lbcast(fl4->daddr))
2534 		type = RTN_BROADCAST;
2535 	else if (ipv4_is_multicast(fl4->daddr))
2536 		type = RTN_MULTICAST;
2537 	else if (ipv4_is_zeronet(fl4->daddr))
2538 		return ERR_PTR(-EINVAL);
2539 
2540 	if (dev_out->flags & IFF_LOOPBACK)
2541 		flags |= RTCF_LOCAL;
2542 
2543 	in_dev = __in_dev_get_rcu(dev_out);
2544 	if (!in_dev)
2545 		return ERR_PTR(-EINVAL);
2546 
2547 	if (type == RTN_BROADCAST) {
2548 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2549 		fi = NULL;
2550 	} else if (type == RTN_MULTICAST) {
2551 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2552 		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553 				     fl4->flowi4_proto))
2554 			flags &= ~RTCF_LOCAL;
2555 		/* If a multicast route does not exist, use the
2556 		 * default one, but do not gateway in this case.
2557 		 * Yes, it is a hack.
2558 		 */
2559 		if (fi && res->prefixlen < 4)
2560 			fi = NULL;
2561 	}
2562 
2563 	rth = rt_dst_alloc(dev_out,
2564 			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2565 			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2566 	if (!rth)
2567 		return ERR_PTR(-ENOBUFS);
2568 
2569 	rth->dst.output = ip_output;
2570 
2571 	rth->rt_key_dst	= orig_daddr;
2572 	rth->rt_key_src	= orig_saddr;
2573 	rth->rt_genid = rt_genid(dev_net(dev_out));
2574 	rth->rt_flags	= flags;
2575 	rth->rt_type	= type;
2576 	rth->rt_key_tos	= orig_rtos;
2577 	rth->rt_dst	= fl4->daddr;
2578 	rth->rt_src	= fl4->saddr;
2579 	rth->rt_route_iif = 0;
2580 	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2581 	rth->rt_oif	= orig_oif;
2582 	rth->rt_mark    = fl4->flowi4_mark;
2583 	rth->rt_gateway = fl4->daddr;
2584 	rth->rt_spec_dst= fl4->saddr;
2585 	rth->rt_peer_genid = 0;
2586 	rth->peer = NULL;
2587 	rth->fi = NULL;
2588 
2589 	RT_CACHE_STAT_INC(out_slow_tot);
2590 
2591 	if (flags & RTCF_LOCAL) {
2592 		rth->dst.input = ip_local_deliver;
2593 		rth->rt_spec_dst = fl4->daddr;
2594 	}
2595 	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596 		rth->rt_spec_dst = fl4->saddr;
2597 		if (flags & RTCF_LOCAL &&
2598 		    !(dev_out->flags & IFF_LOOPBACK)) {
2599 			rth->dst.output = ip_mc_output;
2600 			RT_CACHE_STAT_INC(out_slow_mc);
2601 		}
2602 #ifdef CONFIG_IP_MROUTE
2603 		if (type == RTN_MULTICAST) {
2604 			if (IN_DEV_MFORWARD(in_dev) &&
2605 			    !ipv4_is_local_multicast(fl4->daddr)) {
2606 				rth->dst.input = ip_mr_input;
2607 				rth->dst.output = ip_mc_output;
2608 			}
2609 		}
2610 #endif
2611 	}
2612 
2613 	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2614 
2615 	return rth;
2616 }
2617 
2618 /*
2619  * Major route resolver routine.
2620  * called with rcu_read_lock();
2621  */
2622 
2623 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2624 {
2625 	struct net_device *dev_out = NULL;
2626 	__u8 tos = RT_FL_TOS(fl4);
2627 	unsigned int flags = 0;
2628 	struct fib_result res;
2629 	struct rtable *rth;
2630 	__be32 orig_daddr;
2631 	__be32 orig_saddr;
2632 	int orig_oif;
2633 
2634 	res.fi		= NULL;
2635 #ifdef CONFIG_IP_MULTIPLE_TABLES
2636 	res.r		= NULL;
2637 #endif
2638 
2639 	orig_daddr = fl4->daddr;
2640 	orig_saddr = fl4->saddr;
2641 	orig_oif = fl4->flowi4_oif;
2642 
2643 	fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2647 
2648 	rcu_read_lock();
2649 	if (fl4->saddr) {
2650 		rth = ERR_PTR(-EINVAL);
2651 		if (ipv4_is_multicast(fl4->saddr) ||
2652 		    ipv4_is_lbcast(fl4->saddr) ||
2653 		    ipv4_is_zeronet(fl4->saddr))
2654 			goto out;
2655 
2656 		/* I removed check for oif == dev_out->oif here.
2657 		   It was wrong for two reasons:
2658 		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2659 		      is assigned to multiple interfaces.
2660 		   2. Moreover, we are allowed to send packets with saddr
2661 		      of another iface. --ANK
2662 		 */
2663 
2664 		if (fl4->flowi4_oif == 0 &&
2665 		    (ipv4_is_multicast(fl4->daddr) ||
2666 		     ipv4_is_lbcast(fl4->daddr))) {
2667 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2668 			dev_out = __ip_dev_find(net, fl4->saddr, false);
2669 			if (dev_out == NULL)
2670 				goto out;
2671 
2672 			/* Special hack: the user can direct multicasts
2673 			   and limited broadcast via the necessary interface
2674 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675 			   This hack is not just for fun, it allows
2676 			   vic, vat and friends to work.
2677 			   They bind a socket to loopback, set the ttl to zero
2678 			   and expect that it will work.
2679 			   From the viewpoint of the routing cache they are broken,
2680 			   because we are not allowed to build a multicast path
2681 			   with a loopback source addr (look, the routing cache
2682 			   cannot know that the ttl is zero, so the packet
2683 			   will not leave this host and the route is valid).
2684 			   Luckily, this hack is a good workaround.
2685 			 */
2686 
2687 			fl4->flowi4_oif = dev_out->ifindex;
2688 			goto make_route;
2689 		}
2690 
2691 		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2692 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2693 			if (!__ip_dev_find(net, fl4->saddr, false))
2694 				goto out;
2695 		}
2696 	}
2697 
2698 
2699 	if (fl4->flowi4_oif) {
2700 		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2701 		rth = ERR_PTR(-ENODEV);
2702 		if (dev_out == NULL)
2703 			goto out;
2704 
2705 		/* RACE: Check return value of inet_select_addr instead. */
2706 		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2707 			rth = ERR_PTR(-ENETUNREACH);
2708 			goto out;
2709 		}
2710 		if (ipv4_is_local_multicast(fl4->daddr) ||
2711 		    ipv4_is_lbcast(fl4->daddr)) {
2712 			if (!fl4->saddr)
2713 				fl4->saddr = inet_select_addr(dev_out, 0,
2714 							      RT_SCOPE_LINK);
2715 			goto make_route;
2716 		}
2717 		if (fl4->saddr) {
2718 			if (ipv4_is_multicast(fl4->daddr))
2719 				fl4->saddr = inet_select_addr(dev_out, 0,
2720 							      fl4->flowi4_scope);
2721 			else if (!fl4->daddr)
2722 				fl4->saddr = inet_select_addr(dev_out, 0,
2723 							      RT_SCOPE_HOST);
2724 		}
2725 	}
2726 
2727 	if (!fl4->daddr) {
2728 		fl4->daddr = fl4->saddr;
2729 		if (!fl4->daddr)
2730 			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2731 		dev_out = net->loopback_dev;
2732 		fl4->flowi4_oif = net->loopback_dev->ifindex;
2733 		res.type = RTN_LOCAL;
2734 		flags |= RTCF_LOCAL;
2735 		goto make_route;
2736 	}
2737 
2738 	if (fib_lookup(net, fl4, &res)) {
2739 		res.fi = NULL;
2740 		if (fl4->flowi4_oif) {
2741 			/* Apparently, routing tables are wrong. Assume
2742 			   that the destination is on link.
2743 
2744 			   WHY? DW.
2745 			   Because we are allowed to send to iface
2746 			   even if it has NO routes and NO assigned
2747 			   addresses. When oif is specified, routing
2748 			   tables are looked up with only one purpose:
2749 			   to catch if destination is gatewayed, rather than
2750 			   direct. Moreover, if MSG_DONTROUTE is set,
2751 			   we send packet, ignoring both routing tables
2752 			   and ifaddr state. --ANK
2753 
2754 
2755 			   We could make it even if oif is unknown,
2756 			   likely IPv6, but we do not.
2757 			 */
2758 
2759 			if (fl4->saddr == 0)
2760 				fl4->saddr = inet_select_addr(dev_out, 0,
2761 							      RT_SCOPE_LINK);
2762 			res.type = RTN_UNICAST;
2763 			goto make_route;
2764 		}
2765 		rth = ERR_PTR(-ENETUNREACH);
2766 		goto out;
2767 	}
2768 
2769 	if (res.type == RTN_LOCAL) {
2770 		if (!fl4->saddr) {
2771 			if (res.fi->fib_prefsrc)
2772 				fl4->saddr = res.fi->fib_prefsrc;
2773 			else
2774 				fl4->saddr = fl4->daddr;
2775 		}
2776 		dev_out = net->loopback_dev;
2777 		fl4->flowi4_oif = dev_out->ifindex;
2778 		res.fi = NULL;
2779 		flags |= RTCF_LOCAL;
2780 		goto make_route;
2781 	}
2782 
2783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2784 	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2785 		fib_select_multipath(&res);
2786 	else
2787 #endif
2788 	if (!res.prefixlen &&
2789 	    res.table->tb_num_default > 1 &&
2790 	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2791 		fib_select_default(&res);
2792 
2793 	if (!fl4->saddr)
2794 		fl4->saddr = FIB_RES_PREFSRC(net, res);
2795 
2796 	dev_out = FIB_RES_DEV(res);
2797 	fl4->flowi4_oif = dev_out->ifindex;
2798 
2799 
2800 make_route:
2801 	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2802 			       tos, dev_out, flags);
2803 	if (!IS_ERR(rth)) {
2804 		unsigned int hash;
2805 
2806 		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2807 			       rt_genid(dev_net(dev_out)));
2808 		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2809 	}
2810 
2811 out:
2812 	rcu_read_unlock();
2813 	return rth;
2814 }
2815 
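/* Output route lookup: try the route cache (keyed on daddr/saddr/oif/mark/TOS)
 * first and fall back to the slow path on a miss.
 */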
2816 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2817 {
2818 	struct rtable *rth;
2819 	unsigned int hash;
2820 
2821 	if (!rt_caching(net))
2822 		goto slow_output;
2823 
2824 	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2825 
2826 	rcu_read_lock_bh();
2827 	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2828 		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2829 		if (rth->rt_key_dst == flp4->daddr &&
2830 		    rth->rt_key_src == flp4->saddr &&
2831 		    rt_is_output_route(rth) &&
2832 		    rth->rt_oif == flp4->flowi4_oif &&
2833 		    rth->rt_mark == flp4->flowi4_mark &&
2834 		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2835 			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2836 		    net_eq(dev_net(rth->dst.dev), net) &&
2837 		    !rt_is_expired(rth)) {
2838 			ipv4_validate_peer(rth);
2839 			dst_use(&rth->dst, jiffies);
2840 			RT_CACHE_STAT_INC(out_hit);
2841 			rcu_read_unlock_bh();
2842 			if (!flp4->saddr)
2843 				flp4->saddr = rth->rt_src;
2844 			if (!flp4->daddr)
2845 				flp4->daddr = rth->rt_dst;
2846 			return rth;
2847 		}
2848 		RT_CACHE_STAT_INC(out_hlist_search);
2849 	}
2850 	rcu_read_unlock_bh();
2851 
2852 slow_output:
2853 	return ip_route_output_slow(net, flp4);
2854 }
2855 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856 
2857 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2858 {
2859 	return NULL;
2860 }
2861 
2862 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2863 {
2864 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865 
2866 	return mtu ? : dst->dev->mtu;
2867 }
2868 
2869 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2870 {
2871 }
2872 
2873 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2874 					  unsigned long old)
2875 {
2876 	return NULL;
2877 }
2878 
2879 static struct dst_ops ipv4_dst_blackhole_ops = {
2880 	.family			=	AF_INET,
2881 	.protocol		=	cpu_to_be16(ETH_P_IP),
2882 	.destroy		=	ipv4_dst_destroy,
2883 	.check			=	ipv4_blackhole_dst_check,
2884 	.mtu			=	ipv4_blackhole_mtu,
2885 	.default_advmss		=	ipv4_default_advmss,
2886 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2887 	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2888 	.neigh_lookup		=	ipv4_neigh_lookup,
2889 };
2890 
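/* Clone an existing route into a dst whose input and output handlers simply
 * discard packets.
 */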
2891 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2892 {
2893 	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2894 	struct rtable *ort = (struct rtable *) dst_orig;
2895 
2896 	if (rt) {
2897 		struct dst_entry *new = &rt->dst;
2898 
2899 		new->__use = 1;
2900 		new->input = dst_discard;
2901 		new->output = dst_discard;
2902 		dst_copy_metrics(new, &ort->dst);
2903 
2904 		new->dev = ort->dst.dev;
2905 		if (new->dev)
2906 			dev_hold(new->dev);
2907 
2908 		rt->rt_key_dst = ort->rt_key_dst;
2909 		rt->rt_key_src = ort->rt_key_src;
2910 		rt->rt_key_tos = ort->rt_key_tos;
2911 		rt->rt_route_iif = ort->rt_route_iif;
2912 		rt->rt_iif = ort->rt_iif;
2913 		rt->rt_oif = ort->rt_oif;
2914 		rt->rt_mark = ort->rt_mark;
2915 
2916 		rt->rt_genid = rt_genid(net);
2917 		rt->rt_flags = ort->rt_flags;
2918 		rt->rt_type = ort->rt_type;
2919 		rt->rt_dst = ort->rt_dst;
2920 		rt->rt_src = ort->rt_src;
2921 		rt->rt_gateway = ort->rt_gateway;
2922 		rt->rt_spec_dst = ort->rt_spec_dst;
2923 		rt->peer = ort->peer;
2924 		if (rt->peer)
2925 			atomic_inc(&rt->peer->refcnt);
2926 		rt->fi = ort->fi;
2927 		if (rt->fi)
2928 			atomic_inc(&rt->fi->fib_clntref);
2929 
2930 		dst_free(new);
2931 	}
2932 
2933 	dst_release(dst_orig);
2934 
2935 	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2936 }
2937 
2938 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2939 				    struct sock *sk)
2940 {
2941 	struct rtable *rt = __ip_route_output_key(net, flp4);
2942 
2943 	if (IS_ERR(rt))
2944 		return rt;
2945 
2946 	if (flp4->flowi4_proto)
2947 		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 						   flowi4_to_flowi(flp4),
2949 						   sk, 0);
2950 
2951 	return rt;
2952 }
2953 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954 
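/* Dump one cached route into an RTM_NEWROUTE netlink message: keys, gateway,
 * metrics, peer-derived id/timestamp/expiry and, for input routes, multicast
 * forwarding information.
 */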
2955 static int rt_fill_info(struct net *net,
2956 			struct sk_buff *skb, u32 pid, u32 seq, int event,
2957 			int nowait, unsigned int flags)
2958 {
2959 	struct rtable *rt = skb_rtable(skb);
2960 	struct rtmsg *r;
2961 	struct nlmsghdr *nlh;
2962 	unsigned long expires = 0;
2963 	const struct inet_peer *peer = rt->peer;
2964 	u32 id = 0, ts = 0, tsage = 0, error;
2965 
2966 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2967 	if (nlh == NULL)
2968 		return -EMSGSIZE;
2969 
2970 	r = nlmsg_data(nlh);
2971 	r->rtm_family	 = AF_INET;
2972 	r->rtm_dst_len	= 32;
2973 	r->rtm_src_len	= 0;
2974 	r->rtm_tos	= rt->rt_key_tos;
2975 	r->rtm_table	= RT_TABLE_MAIN;
2976 	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2977 	r->rtm_type	= rt->rt_type;
2978 	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2979 	r->rtm_protocol = RTPROT_UNSPEC;
2980 	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2981 	if (rt->rt_flags & RTCF_NOTIFY)
2982 		r->rtm_flags |= RTM_F_NOTIFY;
2983 
2984 	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2985 
2986 	if (rt->rt_key_src) {
2987 		r->rtm_src_len = 32;
2988 		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2989 	}
2990 	if (rt->dst.dev)
2991 		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2992 #ifdef CONFIG_IP_ROUTE_CLASSID
2993 	if (rt->dst.tclassid)
2994 		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2995 #endif
2996 	if (rt_is_input_route(rt))
2997 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2998 	else if (rt->rt_src != rt->rt_key_src)
2999 		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3000 
3001 	if (rt->rt_dst != rt->rt_gateway)
3002 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3003 
3004 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005 		goto nla_put_failure;
3006 
3007 	if (rt->rt_mark)
3008 		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3009 
3010 	error = rt->dst.error;
3011 	if (peer) {
3012 		inet_peer_refcheck(rt->peer);
3013 		id = atomic_read(&peer->ip_id_count) & 0xffff;
3014 		if (peer->tcp_ts_stamp) {
3015 			ts = peer->tcp_ts;
3016 			tsage = get_seconds() - peer->tcp_ts_stamp;
3017 		}
3018 		expires = ACCESS_ONCE(peer->pmtu_expires);
3019 		if (expires) {
3020 			if (time_before(jiffies, expires))
3021 				expires -= jiffies;
3022 			else
3023 				expires = 0;
3024 		}
3025 	}
3026 
3027 	if (rt_is_input_route(rt)) {
3028 #ifdef CONFIG_IP_MROUTE
3029 		__be32 dst = rt->rt_dst;
3030 
3031 		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3032 		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3033 			int err = ipmr_get_route(net, skb,
3034 						 rt->rt_src, rt->rt_dst,
3035 						 r, nowait);
3036 			if (err <= 0) {
3037 				if (!nowait) {
3038 					if (err == 0)
3039 						return 0;
3040 					goto nla_put_failure;
3041 				} else {
3042 					if (err == -EMSGSIZE)
3043 						goto nla_put_failure;
3044 					error = err;
3045 				}
3046 			}
3047 		} else
3048 #endif
3049 			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3050 	}
3051 
3052 	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3053 			       expires, error) < 0)
3054 		goto nla_put_failure;
3055 
3056 	return nlmsg_end(skb, nlh);
3057 
3058 nla_put_failure:
3059 	nlmsg_cancel(skb, nlh);
3060 	return -EMSGSIZE;
3061 }
3062 
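/* RTM_GETROUTE handler: build a dummy skb, resolve the requested route via the
 * input or output path, and answer the caller with rt_fill_info().
 */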
3063 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3064 {
3065 	struct net *net = sock_net(in_skb->sk);
3066 	struct rtmsg *rtm;
3067 	struct nlattr *tb[RTA_MAX+1];
3068 	struct rtable *rt = NULL;
3069 	__be32 dst = 0;
3070 	__be32 src = 0;
3071 	u32 iif;
3072 	int err;
3073 	int mark;
3074 	struct sk_buff *skb;
3075 
3076 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3077 	if (err < 0)
3078 		goto errout;
3079 
3080 	rtm = nlmsg_data(nlh);
3081 
3082 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3083 	if (skb == NULL) {
3084 		err = -ENOBUFS;
3085 		goto errout;
3086 	}
3087 
3088 	/* Reserve room for dummy headers; this skb can pass
3089 	   through a good chunk of the routing engine.
3090 	 */
3091 	skb_reset_mac_header(skb);
3092 	skb_reset_network_header(skb);
3093 
3094 	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3095 	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3096 	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3097 
3098 	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3099 	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3100 	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3101 	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3102 
3103 	if (iif) {
3104 		struct net_device *dev;
3105 
3106 		dev = __dev_get_by_index(net, iif);
3107 		if (dev == NULL) {
3108 			err = -ENODEV;
3109 			goto errout_free;
3110 		}
3111 
3112 		skb->protocol	= htons(ETH_P_IP);
3113 		skb->dev	= dev;
3114 		skb->mark	= mark;
3115 		local_bh_disable();
3116 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3117 		local_bh_enable();
3118 
3119 		rt = skb_rtable(skb);
3120 		if (err == 0 && rt->dst.error)
3121 			err = -rt->dst.error;
3122 	} else {
3123 		struct flowi4 fl4 = {
3124 			.daddr = dst,
3125 			.saddr = src,
3126 			.flowi4_tos = rtm->rtm_tos,
3127 			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3128 			.flowi4_mark = mark,
3129 		};
3130 		rt = ip_route_output_key(net, &fl4);
3131 
3132 		err = 0;
3133 		if (IS_ERR(rt))
3134 			err = PTR_ERR(rt);
3135 	}
3136 
3137 	if (err)
3138 		goto errout_free;
3139 
3140 	skb_dst_set(skb, &rt->dst);
3141 	if (rtm->rtm_flags & RTM_F_NOTIFY)
3142 		rt->rt_flags |= RTCF_NOTIFY;
3143 
3144 	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3145 			   RTM_NEWROUTE, 0, 0);
3146 	if (err <= 0)
3147 		goto errout_free;
3148 
3149 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3150 errout:
3151 	return err;
3152 
3153 errout_free:
3154 	kfree_skb(skb);
3155 	goto errout;
3156 }
3157 
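/* Dump the whole route cache over netlink, walking every hash chain and
 * resuming from the position saved in cb->args[].
 */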
3158 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3159 {
3160 	struct rtable *rt;
3161 	int h, s_h;
3162 	int idx, s_idx;
3163 	struct net *net;
3164 
3165 	net = sock_net(skb->sk);
3166 
3167 	s_h = cb->args[0];
3168 	if (s_h < 0)
3169 		s_h = 0;
3170 	s_idx = idx = cb->args[1];
3171 	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3172 		if (!rt_hash_table[h].chain)
3173 			continue;
3174 		rcu_read_lock_bh();
3175 		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3176 		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3177 			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3178 				continue;
3179 			if (rt_is_expired(rt))
3180 				continue;
3181 			skb_dst_set_noref(skb, &rt->dst);
3182 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3183 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3184 					 1, NLM_F_MULTI) <= 0) {
3185 				skb_dst_drop(skb);
3186 				rcu_read_unlock_bh();
3187 				goto done;
3188 			}
3189 			skb_dst_drop(skb);
3190 		}
3191 		rcu_read_unlock_bh();
3192 	}
3193 
3194 done:
3195 	cb->args[0] = h;
3196 	cb->args[1] = idx;
3197 	return skb->len;
3198 }
3199 
3200 void ip_rt_multicast_event(struct in_device *in_dev)
3201 {
3202 	rt_cache_flush(dev_net(in_dev->dev), 0);
3203 }
3204 
3205 #ifdef CONFIG_SYSCTL
3206 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3207 					void __user *buffer,
3208 					size_t *lenp, loff_t *ppos)
3209 {
3210 	if (write) {
3211 		int flush_delay;
3212 		ctl_table ctl;
3213 		struct net *net;
3214 
3215 		memcpy(&ctl, __ctl, sizeof(ctl));
3216 		ctl.data = &flush_delay;
3217 		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3218 
3219 		net = (struct net *)__ctl->extra1;
3220 		rt_cache_flush(net, flush_delay);
3221 		return 0;
3222 	}
3223 
3224 	return -EINVAL;
3225 }
3226 
3227 static ctl_table ipv4_route_table[] = {
3228 	{
3229 		.procname	= "gc_thresh",
3230 		.data		= &ipv4_dst_ops.gc_thresh,
3231 		.maxlen		= sizeof(int),
3232 		.mode		= 0644,
3233 		.proc_handler	= proc_dointvec,
3234 	},
3235 	{
3236 		.procname	= "max_size",
3237 		.data		= &ip_rt_max_size,
3238 		.maxlen		= sizeof(int),
3239 		.mode		= 0644,
3240 		.proc_handler	= proc_dointvec,
3241 	},
3242 	{
3243 		/*  Deprecated. Use gc_min_interval_ms */
3244 
3245 		.procname	= "gc_min_interval",
3246 		.data		= &ip_rt_gc_min_interval,
3247 		.maxlen		= sizeof(int),
3248 		.mode		= 0644,
3249 		.proc_handler	= proc_dointvec_jiffies,
3250 	},
3251 	{
3252 		.procname	= "gc_min_interval_ms",
3253 		.data		= &ip_rt_gc_min_interval,
3254 		.maxlen		= sizeof(int),
3255 		.mode		= 0644,
3256 		.proc_handler	= proc_dointvec_ms_jiffies,
3257 	},
3258 	{
3259 		.procname	= "gc_timeout",
3260 		.data		= &ip_rt_gc_timeout,
3261 		.maxlen		= sizeof(int),
3262 		.mode		= 0644,
3263 		.proc_handler	= proc_dointvec_jiffies,
3264 	},
3265 	{
3266 		.procname	= "gc_interval",
3267 		.data		= &ip_rt_gc_interval,
3268 		.maxlen		= sizeof(int),
3269 		.mode		= 0644,
3270 		.proc_handler	= proc_dointvec_jiffies,
3271 	},
3272 	{
3273 		.procname	= "redirect_load",
3274 		.data		= &ip_rt_redirect_load,
3275 		.maxlen		= sizeof(int),
3276 		.mode		= 0644,
3277 		.proc_handler	= proc_dointvec,
3278 	},
3279 	{
3280 		.procname	= "redirect_number",
3281 		.data		= &ip_rt_redirect_number,
3282 		.maxlen		= sizeof(int),
3283 		.mode		= 0644,
3284 		.proc_handler	= proc_dointvec,
3285 	},
3286 	{
3287 		.procname	= "redirect_silence",
3288 		.data		= &ip_rt_redirect_silence,
3289 		.maxlen		= sizeof(int),
3290 		.mode		= 0644,
3291 		.proc_handler	= proc_dointvec,
3292 	},
3293 	{
3294 		.procname	= "error_cost",
3295 		.data		= &ip_rt_error_cost,
3296 		.maxlen		= sizeof(int),
3297 		.mode		= 0644,
3298 		.proc_handler	= proc_dointvec,
3299 	},
3300 	{
3301 		.procname	= "error_burst",
3302 		.data		= &ip_rt_error_burst,
3303 		.maxlen		= sizeof(int),
3304 		.mode		= 0644,
3305 		.proc_handler	= proc_dointvec,
3306 	},
3307 	{
3308 		.procname	= "gc_elasticity",
3309 		.data		= &ip_rt_gc_elasticity,
3310 		.maxlen		= sizeof(int),
3311 		.mode		= 0644,
3312 		.proc_handler	= proc_dointvec,
3313 	},
3314 	{
3315 		.procname	= "mtu_expires",
3316 		.data		= &ip_rt_mtu_expires,
3317 		.maxlen		= sizeof(int),
3318 		.mode		= 0644,
3319 		.proc_handler	= proc_dointvec_jiffies,
3320 	},
3321 	{
3322 		.procname	= "min_pmtu",
3323 		.data		= &ip_rt_min_pmtu,
3324 		.maxlen		= sizeof(int),
3325 		.mode		= 0644,
3326 		.proc_handler	= proc_dointvec,
3327 	},
3328 	{
3329 		.procname	= "min_adv_mss",
3330 		.data		= &ip_rt_min_advmss,
3331 		.maxlen		= sizeof(int),
3332 		.mode		= 0644,
3333 		.proc_handler	= proc_dointvec,
3334 	},
3335 	{ }
3336 };
3337 
3338 static struct ctl_table empty[1];
3339 
3340 static struct ctl_table ipv4_skeleton[] =
3341 {
3342 	{ .procname = "route",
3343 	  .mode = 0555, .child = ipv4_route_table},
3344 	{ .procname = "neigh",
3345 	  .mode = 0555, .child = empty},
3346 	{ }
3347 };
3348 
3349 static __net_initdata struct ctl_path ipv4_path[] = {
3350 	{ .procname = "net", },
3351 	{ .procname = "ipv4", },
3352 	{ },
3353 };
3354 
3355 static struct ctl_table ipv4_route_flush_table[] = {
3356 	{
3357 		.procname	= "flush",
3358 		.maxlen		= sizeof(int),
3359 		.mode		= 0200,
3360 		.proc_handler	= ipv4_sysctl_rtcache_flush,
3361 	},
3362 	{ },
3363 };
3364 
3365 static __net_initdata struct ctl_path ipv4_route_path[] = {
3366 	{ .procname = "net", },
3367 	{ .procname = "ipv4", },
3368 	{ .procname = "route", },
3369 	{ },
3370 };
3371 
3372 static __net_init int sysctl_route_net_init(struct net *net)
3373 {
3374 	struct ctl_table *tbl;
3375 
3376 	tbl = ipv4_route_flush_table;
3377 	if (!net_eq(net, &init_net)) {
3378 		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379 		if (tbl == NULL)
3380 			goto err_dup;
3381 	}
3382 	tbl[0].extra1 = net;
3383 
3384 	net->ipv4.route_hdr =
3385 		register_net_sysctl_table(net, ipv4_route_path, tbl);
3386 	if (net->ipv4.route_hdr == NULL)
3387 		goto err_reg;
3388 	return 0;
3389 
3390 err_reg:
3391 	if (tbl != ipv4_route_flush_table)
3392 		kfree(tbl);
3393 err_dup:
3394 	return -ENOMEM;
3395 }
3396 
3397 static __net_exit void sysctl_route_net_exit(struct net *net)
3398 {
3399 	struct ctl_table *tbl;
3400 
3401 	tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 	unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 	BUG_ON(tbl == ipv4_route_flush_table);
3404 	kfree(tbl);
3405 }
3406 
3407 static __net_initdata struct pernet_operations sysctl_route_ops = {
3408 	.init = sysctl_route_net_init,
3409 	.exit = sysctl_route_net_exit,
3410 };
3411 #endif
3412 
3413 static __net_init int rt_genid_init(struct net *net)
3414 {
3415 	get_random_bytes(&net->ipv4.rt_genid,
3416 			 sizeof(net->ipv4.rt_genid));
3417 	get_random_bytes(&net->ipv4.dev_addr_genid,
3418 			 sizeof(net->ipv4.dev_addr_genid));
3419 	return 0;
3420 }
3421 
3422 static __net_initdata struct pernet_operations rt_genid_ops = {
3423 	.init = rt_genid_init,
3424 };
3425 
3426 
3427 #ifdef CONFIG_IP_ROUTE_CLASSID
3428 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3429 #endif /* CONFIG_IP_ROUTE_CLASSID */
3430 
3431 static __initdata unsigned long rhash_entries;
3432 static int __init set_rhash_entries(char *str)
3433 {
3434 	if (!str)
3435 		return 0;
3436 	rhash_entries = simple_strtoul(str, &str, 0);
3437 	return 1;
3438 }
3439 __setup("rhash_entries=", set_rhash_entries);
3440 
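/* Boot-time initialisation of the IPv4 routing code: dst caches, the route
 * cache hash table, devinet/fib init, the expiry worker, procfs, xfrm and
 * netlink/sysctl registration.
 */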
3441 int __init ip_rt_init(void)
3442 {
3443 	int rc = 0;
3444 
3445 #ifdef CONFIG_IP_ROUTE_CLASSID
3446 	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3447 	if (!ip_rt_acct)
3448 		panic("IP: failed to allocate ip_rt_acct\n");
3449 #endif
3450 
3451 	ipv4_dst_ops.kmem_cachep =
3452 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3453 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3454 
3455 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3456 
3457 	if (dst_entries_init(&ipv4_dst_ops) < 0)
3458 		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3459 
3460 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3461 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3462 
3463 	rt_hash_table = (struct rt_hash_bucket *)
3464 		alloc_large_system_hash("IP route cache",
3465 					sizeof(struct rt_hash_bucket),
3466 					rhash_entries,
3467 					(totalram_pages >= 128 * 1024) ?
3468 					15 : 17,
3469 					0,
3470 					&rt_hash_log,
3471 					&rt_hash_mask,
3472 					rhash_entries ? 0 : 512 * 1024);
3473 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3474 	rt_hash_lock_init();
3475 
3476 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3477 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3478 
3479 	devinet_init();
3480 	ip_fib_init();
3481 
3482 	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3483 	expires_ljiffies = jiffies;
3484 	schedule_delayed_work(&expires_work,
3485 		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3486 
3487 	if (ip_rt_proc_init())
3488 		printk(KERN_ERR "Unable to create route proc files\n");
3489 #ifdef CONFIG_XFRM
3490 	xfrm_init();
3491 	xfrm4_init(ip_rt_max_size);
3492 #endif
3493 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3494 
3495 #ifdef CONFIG_SYSCTL
3496 	register_pernet_subsys(&sysctl_route_ops);
3497 #endif
3498 	register_pernet_subsys(&rt_genid_ops);
3499 	return rc;
3500 }
3501 
3502 #ifdef CONFIG_SYSCTL
3503 /*
3504  * We really need to sanitize the damn ipv4 init order, then all
3505  * this nonsense will go away.
3506  */
3507 void __init ip_static_sysctl_init(void)
3508 {
3509 	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3510 }
3511 #endif
3512