/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	: 	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

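/*
 * __ip6_local_out() finalizes the IPv6 header (payload_len is everything
 * past the fixed header, set to 0 when it would exceed IPV6_MAXPLEN, as
 * a zero payload_len is what jumbograms use) and runs the
 * NF_INET_LOCAL_OUT netfilter hook.  A return value of 1 means the hook
 * accepted the packet and the caller must hand it to dst_output(),
 * which ip6_local_out() does below.
 */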
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

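/*
 * ip6_finish_output2() hands a packet to the neighbour layer.  For
 * multicast destinations it first loops a clone back to the local stack
 * when the sending socket wants to see its own traffic (sk_mc_loop())
 * and either a multicast router is listening or we are a member of the
 * destination group, and it drops packets whose hop limit is already 0.
 */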
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

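/*
 * ip6_output() is the dst output method for locally generated and
 * forwarded packets.  It discards everything when IPv6 is
 * administratively disabled on the device, and otherwise runs the
 * NF_INET_POST_ROUTING hook, skipping the hook for packets flagged
 * IP6SKB_REROUTED.
 */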
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

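	/* The first 32 bits of the IPv6 header pack version (6 << 28),
	 * traffic class (bits 27..20) and the 20-bit flow label, which
	 * fl6->flowlabel already carries in network byte order.
	 */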
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance-critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

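/*
 * ip6_call_ra_chain() delivers a packet carrying a Router Alert option
 * to every raw socket registered for that alert value (ra->sel) on a
 * matching device.  All but the last matching socket get a clone; the
 * last one consumes the original skb, in which case 1 is returned and
 * the caller must not touch the packet again.
 */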
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

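/*
 * ip6_forward_proxy_check() decides what to do with a packet whose
 * destination we merely proxy (proxy_ndp): unicast neighbour discovery
 * messages are passed to the local input path (return 1), traffic to a
 * proxied link-local address is refused with a link failure (return -1),
 * and anything else may be forwarded normally (return 0).
 */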
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbour
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

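/*
 * ip6_forward() implements the router path: validate that forwarding is
 * enabled and the packet is really addressed to someone else, divert
 * Router Alert packets to interested sockets, enforce and decrement the
 * hop limit, handle proxy ND, possibly emit an ICMPv6 redirect when the
 * packet leaves through the interface it arrived on, check the MTU (we
 * never fragment forwarded IPv6 packets; we send Packet Too Big
 * instead), and finally pass the packet through NF_INET_FORWARD to
 * dst_output().
 */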
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on RA packets: we push them
	 *	to user level AS IS, without any warranty that the
	 *	application will be able to interpret them. The reason
	 *	is that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	   cannot send redirects for source-routed frames.
	   We also don't send redirects for frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour_noref(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

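/*
 * Copy per-packet metadata (type, priority, protocol, dst, device, mark,
 * traffic-control index, netfilter and security state) from the original
 * packet to a freshly allocated fragment.
 */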
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

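/*
 * ip6_find_1stfragopt() returns the length of the unfragmentable part
 * of the packet, i.e. the offset at which a Fragment header has to be
 * inserted: Hop-by-Hop, Routing and Destination Options headers that
 * belong in front of it (including a Destination Options header
 * carrying a Home Address option for Mobile IPv6) are skipped.
 * *nexthdr is left pointing at the nexthdr byte that must be rewritten
 * to NEXTHDR_FRAGMENT.
 */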
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

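/*
 * ipv6_select_ident() chooses the 32-bit fragment identification.  When
 * the route has an inet_peer entry, a per-destination counter is used
 * (inet_getid()); otherwise we fall back to a global atomic counter,
 * skipping the value 0 so that a zero frag_id can serve as "not yet
 * assigned" in callers such as ip6_fragment().
 */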
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

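/*
 * ip6_fragment() splits a too-big packet and feeds each fragment to
 * output().  Two strategies are used: a fast path that, when the packet
 * already carries a suitably laid out frag_list, converts each list
 * member into a fragment in place, and a slow path that allocates a new
 * skb per fragment and copies the data into it.  Every fragment gets
 * the unfragmentable headers, a Fragment header with a shared
 * identification, and an offset/IP6_MF pair describing its position.
 */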
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);


	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

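/*
 * ip6_rt_check() returns true when a cached route can no longer be
 * trusted for the address in the flow: the route is not a host route
 * for exactly that address, and the address also differs from the one
 * cached on this connected socket (daddr_cache/saddr_cache).
 */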
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route,
	 *    check that the cached destination is current.
	 *    If it is a network route, we still may
	 *    check its validity using the saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

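/*
 * ip6_dst_lookup_tail() does the common part of the lookup helpers
 * below: resolve the route if none is cached, pick a source address
 * when the flow has none, and (with optimistic DAD) re-route via the
 * default router when our source address is still optimistic and the
 * next hop's neighbour entry is not yet valid.
 */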
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here, if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

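/*
 * ip6_ufo_append_data() is the UDP fragmentation offload path of
 * ip6_append_data(): instead of building MTU-sized fragments in
 * software it accumulates the whole datagram in one skb, records the
 * fragment size in gso_size (rounded down to a multiple of 8) and a
 * pre-selected fragment id in ip6_frag_id, and lets the device (or the
 * GSO layer) do the actual splitting.
 */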
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}


static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

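/*
 * ip6_append_data() queues user data for transmission, packetizing it
 * into MTU-sized skbs on sk->sk_write_queue.  The first call on an
 * empty queue sets up the cork state (duplicated extension headers,
 * route, fragment size); later calls with MSG_MORE keep appending.
 * Nothing hits the wire until ip6_push_pending_frames() builds the
 * IPv6 header and sends the queue as one datagram (fragmented by
 * ip6_fragment() if it ended up larger than one packet).
 */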
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
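
	/* The largest per-fragment length: round the space after the
	 * per-fragment headers down to a multiple of 8 (fragment offsets
	 * are expressed in 8-octet units) and leave room for the Fragment
	 * header that ip6_fragment() will insert if the datagram needs it.
	 */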
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamping is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at the tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen + dst_exthdrlen);
			skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
			data += fragheaderlen + dst_exthdrlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

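/*
 * Release the cork state set up by ip6_append_data(): the duplicated
 * extension headers, the held route and the saved flow.
 */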
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

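/*
 * ip6_push_pending_frames() turns the per-socket write queue built by
 * ip6_append_data() into a single datagram: the queued skbs are chained
 * onto the first one's frag_list, the extension headers and the IPv6
 * header are pushed in front, and the result goes out through
 * ip6_local_out() (and thus ip6_fragment() when it exceeds the MTU).
 */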
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}