/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and the traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. In short, ttl is no
   solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but the exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl, etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
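
/* Illustrative sketch of the xmit_recursion guard mentioned above.
 * The real counter lives in net/core/dev.c; this is a simplified
 * rendering of the idea, not code from this file:
 *
 *	#define RECURSION_LIMIT 10
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *
 *	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 *		goto recursion_alert;		(drop: dead loop suspected)
 *	__this_cpu_inc(xmit_recursion);
 *	rc = dev_hard_start_xmit(skb, dev, txq);
 *	__this_cpu_dec(xmit_recursion);
 */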

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched to a configured keyless tunnel,
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
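
/* Worked example: for the raw 32-bit value 0x0000002A the macro folds
 * the two low nibbles, (0xA ^ 0x2) & 0xF = 0x8, so the entry lands in
 * bucket 8 of the 16 available.
 */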

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking: hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
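
/* Typical reader-side use of the iterator above (a sketch of what
 * ipgre_tunnel_lookup() does; writers instead modify the chains under
 * RTNL using rtnl_dereference()):
 *
 *	struct ip_tunnel *t;
 *	unsigned int h = HASH(key);
 *
 *	rcu_read_lock();
 *	for_each_ip_tunnel_rcu(ign->tunnels_wc[h])
 *		if (t->dev->flags & IFF_UP)
 *			break;
 *	rcu_read_unlock();
 */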

/* Often-modified stats are per-cpu; others are shared (netdev->stats). */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));

static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}
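
/* The hot paths update these counters locklessly through
 * this_cpu_ptr(dev->tstats); see ipgre_rcv() and ipgre_tunnel_xmit().
 */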

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
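
/* Scoring recap: an exact (link, dev_type) match returns immediately
 * (score 0); a mismatched link costs 1, a type mismatch costs 2, and
 * the lowest-scoring candidate across all four tables wins.  For
 * example, a packet arriving on ifindex 5 prefers a tunnel with
 * parms.link == 5 over an otherwise identical one with parms.link == 0.
 */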

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All routers (except for Linux) return only
   8 bytes of packet payload. This means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. This makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee,
   so why the hell do these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, keyed messages are dropped here. */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
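
/* GRE header as parsed by ipgre_rcv() below (RFC 2784 base header plus
 * the RFC 2890 key/sequence extensions); the layout is a sketch:
 *
 *	word 0:	| C | R | K | S | reserved | ver |  protocol type  |
 *	word 1:	|    checksum (if C)    |     reserved (if C)      |
 *	word 2:	|                    key (if K)                    |
 *	word 3:	|              sequence number (if S)              |
 *
 * The optional words always appear in this order, which is why the
 * receive path below simply advances "offset" by 4 for each flag set.
 */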

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change the protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct neighbour *neigh = dst_get_neighbour_noref(skb_dst(skb));
			const struct in6_addr *addr6;
			int addr_type;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the outer IP header.
	 */

	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess the output device to choose a reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
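
/* Worked example: with csum, key and seq all enabled, addend ends up
 * as 20 (iphdr) + 4 (base GRE) + 4 + 4 + 4 = 36, so a tunnel routed
 * over a plain 1500-byte Ethernet device gets mtu 1500 - 36 = 1464
 * (hard_header_len is 0 for an ARPHRD_IPGRE device).
 */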

static int
ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
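
/* Bounds above: 68 is the historical IPv4 minimum MTU, and the upper
 * check keeps new_mtu plus the outer IP/GRE headers within 0xFFF8
 * (65528), the largest datagram size that still falls on an 8-byte
 * fragment-offset boundary.
 */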

/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
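
/* These attributes map onto the iproute2 command line, roughly
 * (addresses here are only examples):
 *
 *	ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 \
 *		ttl 64 key 42
 *
 * where "key 42" sets IFLA_GRE_IKEY and IFLA_GRE_OKEY and adds
 * GRE_KEY to both flag attributes.
 */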

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
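
/* "gretap" creates an ARPHRD_ETHER device that can be bridged like any
 * other Ethernet interface, e.g. (illustrative, iproute2/bridge-utils):
 *
 *	ip link add tap0 type gretap local 10.0.0.1 remote 10.0.0.2
 *	brctl addif br0 tap0
 */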

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");