xref: /linux/net/ipv6/ip6_output.c (revision 7945fe4858663f2d3b80b5ddcc939efbeba4f36e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
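/* Final transmit step for a single IPv6 packet: grow the headroom if the
 * device needs more, loop multicast back to local listeners and enforce
 * multicast scope, honour lwtunnel transmit redirects, then resolve the
 * nexthop neighbour and hand the skb to neigh_output().
 */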
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst_dev(dst);
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		/* Make sure idev stays alive */
74 		rcu_read_lock();
75 		skb = skb_expand_head(skb, hh_len);
76 		if (!skb) {
77 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
78 			rcu_read_unlock();
79 			return -ENOMEM;
80 		}
81 		rcu_read_unlock();
82 	}
83 
84 	hdr = ipv6_hdr(skb);
85 	daddr = &hdr->daddr;
86 	if (ipv6_addr_is_multicast(daddr)) {
87 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
88 		    ((mroute6_is_socket(net, skb) &&
89 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
90 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
91 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
92 
93 			/* Do not check for IFF_ALLMULTI; multicast routing
94 			 * is not supported in any case.
95 			 */
96 			if (newskb)
97 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
98 					net, sk, newskb, NULL, newskb->dev,
99 					dev_loopback_xmit);
100 
101 			if (hdr->hop_limit == 0) {
102 				IP6_INC_STATS(net, idev,
103 					      IPSTATS_MIB_OUTDISCARDS);
104 				kfree_skb(skb);
105 				return 0;
106 			}
107 		}
108 
109 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
110 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
111 		    !(dev->flags & IFF_LOOPBACK)) {
112 			kfree_skb(skb);
113 			return 0;
114 		}
115 	}
116 
117 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
118 		int res = lwtunnel_xmit(skb);
119 
120 		if (res != LWTUNNEL_XMIT_CONTINUE)
121 			return res;
122 	}
123 
124 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
125 
126 	rcu_read_lock();
127 	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
128 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
129 
130 	if (IS_ERR_OR_NULL(neigh)) {
131 		if (unlikely(!neigh))
132 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
133 		if (IS_ERR(neigh)) {
134 			rcu_read_unlock();
135 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
136 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
137 			return -EINVAL;
138 		}
139 	}
140 	sock_confirm_neigh(skb, neigh);
141 	ret = neigh_output(neigh, skb, false);
142 	rcu_read_unlock();
143 	return ret;
144 }
145 
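/* Software-segment a GSO packet whose segments do not fit the egress MTU,
 * then send each segment through ip6_finish_output2(), fragmenting any
 * segment that is still larger than the MTU; the first error seen is
 * returned.
 */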
146 static int
147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
148 				    struct sk_buff *skb, unsigned int mtu)
149 {
150 	struct sk_buff *segs, *nskb;
151 	netdev_features_t features;
152 	int ret = 0;
153 
154 	/* Please see corresponding comment in ip_finish_output_gso
155 	 * describing the cases where GSO segment length exceeds the
156 	 * egress MTU.
157 	 */
158 	features = netif_skb_features(skb);
159 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
160 	if (IS_ERR_OR_NULL(segs)) {
161 		kfree_skb(skb);
162 		return -ENOMEM;
163 	}
164 
165 	consume_skb(skb);
166 
167 	skb_list_walk_safe(segs, segs, nskb) {
168 		int err;
169 
170 		skb_mark_not_on_list(segs);
171 		/* Last GSO segment can be smaller than gso_size (and MTU).
172 		 * Adding a fragment header would produce an "atomic fragment",
173 		 * which is considered harmful (RFC-8021). Avoid that.
174 		 */
175 		err = segs->len > mtu ?
176 			ip6_fragment(net, sk, segs, ip6_finish_output2) :
177 			ip6_finish_output2(net, sk, segs);
178 		if (err && ret == 0)
179 			ret = err;
180 	}
181 
182 	return ret;
183 }
184 
185 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
186 				 struct sk_buff *skb, unsigned int mtu)
187 {
188 	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
189 	    !skb_gso_validate_network_len(skb, mtu))
190 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
191 
192 	return ip6_finish_output2(net, sk, skb);
193 }
194 
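/* If a new XFRM policy was attached after SNAT, reroute via dst_output();
 * otherwise pick the dst MTU and either handle GSO packets, fragment
 * oversized packets, or transmit directly via ip6_finish_output2().
 */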
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197 	unsigned int mtu;
198 
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200 	/* Policy lookup after SNAT yielded a new policy */
201 	if (skb_dst(skb)->xfrm) {
202 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
203 		return dst_output(net, sk, skb);
204 	}
205 #endif
206 
207 	mtu = ip6_skb_dst_mtu(skb);
208 	if (skb_is_gso(skb))
209 		return ip6_finish_output_gso(net, sk, skb, mtu);
210 
211 	if (skb->len > mtu ||
212 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
213 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
214 
215 	return ip6_finish_output2(net, sk, skb);
216 }
217 
218 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
219 {
220 	int ret;
221 
222 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
223 	switch (ret) {
224 	case NET_XMIT_SUCCESS:
225 	case NET_XMIT_CN:
226 		return __ip6_finish_output(net, sk, skb) ? : ret;
227 	default:
228 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
229 		return ret;
230 	}
231 }
232 
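/* dst output routine for IPv6: drop the packet when IPv6 is disabled on
 * the egress device, otherwise run the NF_INET_POST_ROUTING hook (skipped
 * for rerouted packets) ahead of ip6_finish_output().
 */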
233 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
234 {
235 	struct dst_entry *dst = skb_dst(skb);
236 	struct net_device *dev = dst_dev(dst), *indev = skb->dev;
237 	struct inet6_dev *idev = ip6_dst_idev(dst);
238 
239 	skb->protocol = htons(ETH_P_IPV6);
240 	skb->dev = dev;
241 
242 	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
243 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
244 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
245 		return 0;
246 	}
247 
248 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
249 			    net, sk, skb, indev, dev,
250 			    ip6_finish_output,
251 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
252 }
253 EXPORT_SYMBOL(ip6_output);
254 
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
262 /*
263  * xmit an sk_buff (used by TCP and SCTP)
264  * Note : the socket lock is not held for SYNACK packets, but the socket
265  * might still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
266  * which use proper atomic operations or spinlocks.
267  */
268 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
269 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
270 {
271 	struct net *net = sock_net(sk);
272 	const struct ipv6_pinfo *np = inet6_sk(sk);
273 	struct in6_addr *first_hop = &fl6->daddr;
274 	struct dst_entry *dst = skb_dst(skb);
275 	struct net_device *dev = dst_dev(dst);
276 	struct inet6_dev *idev = ip6_dst_idev(dst);
277 	struct hop_jumbo_hdr *hop_jumbo;
278 	int hoplen = sizeof(*hop_jumbo);
279 	unsigned int head_room;
280 	struct ipv6hdr *hdr;
281 	u8  proto = fl6->flowi6_proto;
282 	int seg_len = skb->len;
283 	int hlimit = -1;
284 	u32 mtu;
285 
286 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
287 	if (opt)
288 		head_room += opt->opt_nflen + opt->opt_flen;
289 
290 	if (unlikely(head_room > skb_headroom(skb))) {
291 		/* Make sure idev stays alive */
292 		rcu_read_lock();
293 		skb = skb_expand_head(skb, head_room);
294 		if (!skb) {
295 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
296 			rcu_read_unlock();
297 			return -ENOBUFS;
298 		}
299 		rcu_read_unlock();
300 	}
301 
302 	if (opt) {
303 		seg_len += opt->opt_nflen + opt->opt_flen;
304 
305 		if (opt->opt_flen)
306 			ipv6_push_frag_opts(skb, opt, &proto);
307 
308 		if (opt->opt_nflen)
309 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
310 					     &fl6->saddr);
311 	}
312 
313 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
314 		hop_jumbo = skb_push(skb, hoplen);
315 
316 		hop_jumbo->nexthdr = proto;
317 		hop_jumbo->hdrlen = 0;
318 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
319 		hop_jumbo->tlv_len = 4;
320 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
321 
322 		proto = IPPROTO_HOPOPTS;
323 		seg_len = 0;
324 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
325 	}
326 
327 	skb_push(skb, sizeof(struct ipv6hdr));
328 	skb_reset_network_header(skb);
329 	hdr = ipv6_hdr(skb);
330 
331 	/*
332 	 *	Fill in the IPv6 header
333 	 */
334 	if (np)
335 		hlimit = READ_ONCE(np->hop_limit);
336 	if (hlimit < 0)
337 		hlimit = ip6_dst_hoplimit(dst);
338 
339 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
340 				ip6_autoflowlabel(net, sk), fl6));
341 
342 	hdr->payload_len = htons(seg_len);
343 	hdr->nexthdr = proto;
344 	hdr->hop_limit = hlimit;
345 
346 	hdr->saddr = fl6->saddr;
347 	hdr->daddr = *first_hop;
348 
349 	skb->protocol = htons(ETH_P_IPV6);
350 	skb->priority = priority;
351 	skb->mark = mark;
352 
353 	mtu = dst_mtu(dst);
354 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
355 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
356 
357 		/* if egress device is enslaved to an L3 master device pass the
358 		 * skb to its handler for processing
359 		 */
360 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
361 		if (unlikely(!skb))
362 			return 0;
363 
364 		/* hooks should never assume socket lock is held.
365 		 * we promote our socket to non const
366 		 */
367 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
368 			       net, (struct sock *)sk, skb, NULL, dev,
369 			       dst_output);
370 	}
371 
372 	skb->dev = dev;
373 	/* ipv6_local_error() does not require socket lock,
374 	 * we promote our socket to non const
375 	 */
376 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
377 
378 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
379 	kfree_skb(skb);
380 	return -EMSGSIZE;
381 }
382 EXPORT_SYMBOL(ip6_xmit);
383 
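/* Deliver a Router Alert packet to every registered raw socket whose
 * selector and bound device match; clones go to all but the last matching
 * socket.  Returns 1 when the skb was consumed, 0 otherwise.
 */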
384 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
385 {
386 	struct ip6_ra_chain *ra;
387 	struct sock *last = NULL;
388 
389 	read_lock(&ip6_ra_lock);
390 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
391 		struct sock *sk = ra->sk;
392 		if (sk && ra->sel == sel &&
393 		    (!sk->sk_bound_dev_if ||
394 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
395 
396 			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
397 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
398 				continue;
399 			}
400 			if (last) {
401 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
402 				if (skb2)
403 					rawv6_rcv(last, skb2);
404 			}
405 			last = sk;
406 		}
407 	}
408 
409 	if (last) {
410 		rawv6_rcv(last, skb);
411 		read_unlock(&ip6_ra_lock);
412 		return 1;
413 	}
414 	read_unlock(&ip6_ra_lock);
415 	return 0;
416 }
417 
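/* Decide how to treat a packet arriving for a proxied (pneigh) address:
 * returns 1 to deliver NDP messages locally via ip6_input(), -1 when the
 * link-local destination cannot be proxied (the packet is dropped), and
 * 0 to forward as usual.
 */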
418 static int ip6_forward_proxy_check(struct sk_buff *skb)
419 {
420 	struct ipv6hdr *hdr = ipv6_hdr(skb);
421 	u8 nexthdr = hdr->nexthdr;
422 	__be16 frag_off;
423 	int offset;
424 
425 	if (ipv6_ext_hdr(nexthdr)) {
426 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
427 		if (offset < 0)
428 			return 0;
429 	} else
430 		offset = sizeof(struct ipv6hdr);
431 
432 	if (nexthdr == IPPROTO_ICMPV6) {
433 		struct icmp6hdr *icmp6;
434 
435 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
436 					 offset + 1 - skb->data)))
437 			return 0;
438 
439 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
440 
441 		switch (icmp6->icmp6_type) {
442 		case NDISC_ROUTER_SOLICITATION:
443 		case NDISC_ROUTER_ADVERTISEMENT:
444 		case NDISC_NEIGHBOUR_SOLICITATION:
445 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
446 		case NDISC_REDIRECT:
447 			/* For unicast neighbour discovery messages destined
448 			 * to the proxied address, pass them to the input
449 			 * function.
450 			 */
451 			return 1;
452 		default:
453 			break;
454 		}
455 	}
456 
457 	/*
458 	 * The proxying router can't forward traffic sent to a link-local
459 	 * address, so signal the sender and discard the packet. This
460 	 * behavior is clarified by the MIPv6 specification.
461 	 */
462 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
463 		dst_link_failure(skb);
464 		return -1;
465 	}
466 
467 	return 0;
468 }
469 
470 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
471 				     struct sk_buff *skb)
472 {
473 #ifdef CONFIG_NET_SWITCHDEV
474 	if (skb->offload_l3_fwd_mark) {
475 		consume_skb(skb);
476 		return 0;
477 	}
478 #endif
479 
480 	skb_clear_tstamp(skb);
481 	return dst_output(net, sk, skb);
482 }
483 
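/* Tell whether a forwarded packet must trigger ICMPV6_PKT_TOOBIG instead
 * of being sent: conntrack-defragmented packets are judged by their
 * original fragment size, while ignore_df packets and GSO packets whose
 * segments fit the MTU pass.
 */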
484 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
485 {
486 	if (skb->len <= mtu)
487 		return false;
488 
489 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
490 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
491 		return true;
492 
493 	if (skb->ignore_df)
494 		return false;
495 
496 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
497 		return false;
498 
499 	return true;
500 }
501 
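/* Forward a received packet: enforce the forwarding sysctl and xfrm
 * policy, hand Router Alert packets to ip6_call_ra_chain(), enforce the
 * hop limit and NDP proxying rules, possibly emit a redirect, check the
 * path MTU, then decrement the hop limit and run the NF_INET_FORWARD hook.
 */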
502 int ip6_forward(struct sk_buff *skb)
503 {
504 	struct dst_entry *dst = skb_dst(skb);
505 	struct ipv6hdr *hdr = ipv6_hdr(skb);
506 	struct inet6_skb_parm *opt = IP6CB(skb);
507 	struct net *net = dev_net(dst_dev(dst));
508 	struct net_device *dev;
509 	struct inet6_dev *idev;
510 	SKB_DR(reason);
511 	u32 mtu;
512 
513 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
514 	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
515 		goto error;
516 
517 	if (skb->pkt_type != PACKET_HOST)
518 		goto drop;
519 
520 	if (unlikely(skb->sk))
521 		goto drop;
522 
523 	if (skb_warn_if_lro(skb))
524 		goto drop;
525 
526 	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
527 	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
528 	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
529 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
530 		goto drop;
531 	}
532 
533 	skb_forward_csum(skb);
534 
535 	/*
536 	 *	We DO NOT make any processing on
537 	 *	RA packets, pushing them to user level AS IS
538  *	without any WARRANTY that the application will be able
539 	 *	to interpret them. The reason is that we
540 	 *	cannot make anything clever here.
541 	 *
542 	 *	We are not end-node, so that if packet contains
543 	 *	AH/ESP, we cannot make anything.
544  *	Defragmentation also would be a mistake; RA packets
545 	 *	cannot be fragmented, because there is no warranty
546 	 *	that different fragments will go along one path. --ANK
547 	 */
548 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
549 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
550 			return 0;
551 	}
552 
553 	/*
554 	 *	check and decrement hop limit
555 	 */
556 	if (hdr->hop_limit <= 1) {
557 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
558 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
559 
560 		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
561 		return -ETIMEDOUT;
562 	}
563 
564 	/* XXX: idev->cnf.proxy_ndp? */
565 	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
566 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
567 		int proxied = ip6_forward_proxy_check(skb);
568 		if (proxied > 0) {
569 			/* It's tempting to decrease the hop limit
570 			 * here by 1, as we do at the end of the
571 			 * function too.
572 			 *
573 			 * But that would be incorrect, as proxying is
574 			 * not forwarding.  The ip6_input function
575 			 * will handle this packet locally, and it
576 			 * depends on the hop limit being unchanged.
577 			 *
578 			 * One example is the NDP hop limit, which
579 			 * always has to stay 255; another would be
580 			 * similar checks around RA packets, where the
581 			 * user can even change the desired limit.
582 			 */
583 			return ip6_input(skb);
584 		} else if (proxied < 0) {
585 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
586 			goto drop;
587 		}
588 	}
589 
590 	if (!xfrm6_route_forward(skb)) {
591 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
592 		SKB_DR_SET(reason, XFRM_POLICY);
593 		goto drop;
594 	}
595 	dst = skb_dst(skb);
596 	dev = dst_dev(dst);
597 	/* IPv6 specs say nothing about it, but it is clear that we cannot
598 	 * send redirects to source routed frames.
599 	 * We don't send redirects to frames decapsulated from IPsec.
600 	 */
601 	if (IP6CB(skb)->iif == dev->ifindex &&
602 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
603 		struct in6_addr *target = NULL;
604 		struct inet_peer *peer;
605 		struct rt6_info *rt;
606 
607 		/*
608 		 *	incoming and outgoing devices are the same
609 		 *	send a redirect.
610 		 */
611 
612 		rt = dst_rt6_info(dst);
613 		if (rt->rt6i_flags & RTF_GATEWAY)
614 			target = &rt->rt6i_gateway;
615 		else
616 			target = &hdr->daddr;
617 
618 		rcu_read_lock();
619 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
620 
621 		/* Limit redirects both by destination (here)
622 		   and by source (inside ndisc_send_redirect)
623 		 */
624 		if (inet_peer_xrlim_allow(peer, 1*HZ))
625 			ndisc_send_redirect(skb, target);
626 		rcu_read_unlock();
627 	} else {
628 		int addrtype = ipv6_addr_type(&hdr->saddr);
629 
630 		/* This check is security critical. */
631 		if (addrtype == IPV6_ADDR_ANY ||
632 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
633 			goto error;
634 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
635 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
636 				    ICMPV6_NOT_NEIGHBOUR, 0);
637 			goto error;
638 		}
639 	}
640 
641 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
642 
643 	mtu = ip6_dst_mtu_maybe_forward(dst, true);
644 	if (mtu < IPV6_MIN_MTU)
645 		mtu = IPV6_MIN_MTU;
646 
647 	if (ip6_pkt_too_big(skb, mtu)) {
648 		/* Again, force OUTPUT device used as source address */
649 		skb->dev = dev;
650 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
651 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
652 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
653 				IPSTATS_MIB_FRAGFAILS);
654 		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
655 		return -EMSGSIZE;
656 	}
657 
658 	if (skb_cow(skb, dev->hard_header_len)) {
659 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
660 				IPSTATS_MIB_OUTDISCARDS);
661 		goto drop;
662 	}
663 
664 	hdr = ipv6_hdr(skb);
665 
666 	/* Decrementing the hop limit is delayed until after the skb COW */
667 
668 	hdr->hop_limit--;
669 
670 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
671 		       net, NULL, skb, skb->dev, dev,
672 		       ip6_forward_finish);
673 
674 error:
675 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
676 	SKB_DR_SET(reason, IP_INADDRERRORS);
677 drop:
678 	kfree_skb_reason(skb, reason);
679 	return -EINVAL;
680 }
681 
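/* Copy per-packet metadata (dst, device, mark, priority, hash, netfilter
 * and security state) from the original skb to a freshly built fragment.
 */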
682 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
683 {
684 	to->pkt_type = from->pkt_type;
685 	to->priority = from->priority;
686 	to->protocol = from->protocol;
687 	skb_dst_drop(to);
688 	skb_dst_set(to, dst_clone(skb_dst(from)));
689 	to->dev = from->dev;
690 	to->mark = from->mark;
691 
692 	skb_copy_hash(to, from);
693 
694 #ifdef CONFIG_NET_SCHED
695 	to->tc_index = from->tc_index;
696 #endif
697 	nf_copy(to, from);
698 	skb_ext_copy(to, from);
699 	skb_copy_secmark(to, from);
700 }
701 
702 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
703 		      u8 nexthdr, __be32 frag_id,
704 		      struct ip6_fraglist_iter *iter)
705 {
706 	unsigned int first_len;
707 	struct frag_hdr *fh;
708 
709 	/* BUILD HEADER */
710 	*prevhdr = NEXTHDR_FRAGMENT;
711 	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
712 	if (!iter->tmp_hdr)
713 		return -ENOMEM;
714 
715 	iter->frag = skb_shinfo(skb)->frag_list;
716 	skb_frag_list_init(skb);
717 
718 	iter->offset = 0;
719 	iter->hlen = hlen;
720 	iter->frag_id = frag_id;
721 	iter->nexthdr = nexthdr;
722 
723 	__skb_pull(skb, hlen);
724 	fh = __skb_push(skb, sizeof(struct frag_hdr));
725 	__skb_push(skb, hlen);
726 	skb_reset_network_header(skb);
727 	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
728 
729 	fh->nexthdr = nexthdr;
730 	fh->reserved = 0;
731 	fh->frag_off = htons(IP6_MF);
732 	fh->identification = frag_id;
733 
734 	first_len = skb_pagelen(skb);
735 	skb->data_len = first_len - skb_headlen(skb);
736 	skb->len = first_len;
737 	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
738 
739 	return 0;
740 }
741 EXPORT_SYMBOL(ip6_fraglist_init);
742 
743 void ip6_fraglist_prepare(struct sk_buff *skb,
744 			  struct ip6_fraglist_iter *iter)
745 {
746 	struct sk_buff *frag = iter->frag;
747 	unsigned int hlen = iter->hlen;
748 	struct frag_hdr *fh;
749 
750 	frag->ip_summed = CHECKSUM_NONE;
751 	skb_reset_transport_header(frag);
752 	fh = __skb_push(frag, sizeof(struct frag_hdr));
753 	__skb_push(frag, hlen);
754 	skb_reset_network_header(frag);
755 	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
756 	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
757 	fh->nexthdr = iter->nexthdr;
758 	fh->reserved = 0;
759 	fh->frag_off = htons(iter->offset);
760 	if (frag->next)
761 		fh->frag_off |= htons(IP6_MF);
762 	fh->identification = iter->frag_id;
763 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
764 	ip6_copy_metadata(frag, skb);
765 }
766 EXPORT_SYMBOL(ip6_fraglist_prepare);
767 
768 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
769 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
770 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
771 {
772 	state->prevhdr = prevhdr;
773 	state->nexthdr = nexthdr;
774 	state->frag_id = frag_id;
775 
776 	state->hlen = hlen;
777 	state->mtu = mtu;
778 
779 	state->left = skb->len - hlen;	/* Space per frame */
780 	state->ptr = hlen;		/* Where to start from */
781 
782 	state->hroom = hdr_room;
783 	state->troom = needed_tailroom;
784 
785 	state->offset = 0;
786 }
787 EXPORT_SYMBOL(ip6_frag_init);
788 
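/* Slow-path fragmentation: allocate the next fragment, copy the network
 * headers and the next block of payload from the original skb, and fill
 * in the fragment header from the iteration state.
 */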
789 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
790 {
791 	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
792 	struct sk_buff *frag;
793 	struct frag_hdr *fh;
794 	unsigned int len;
795 
796 	len = state->left;
797 	/* IF: it doesn't fit, use 'mtu' - the data space left */
798 	if (len > state->mtu)
799 		len = state->mtu;
800 	/* IF: we are not sending up to and including the packet end
801 	   then align the next start on an eight byte boundary */
802 	if (len < state->left)
803 		len &= ~7;
804 
805 	/* Allocate buffer */
806 	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
807 			 state->hroom + state->troom, GFP_ATOMIC);
808 	if (!frag)
809 		return ERR_PTR(-ENOMEM);
810 
811 	/*
812 	 *	Set up data on packet
813 	 */
814 
815 	ip6_copy_metadata(frag, skb);
816 	skb_reserve(frag, state->hroom);
817 	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
818 	skb_reset_network_header(frag);
819 	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
820 	frag->transport_header = (frag->network_header + state->hlen +
821 				  sizeof(struct frag_hdr));
822 
823 	/*
824 	 *	Charge the memory for the fragment to any owner
825 	 *	it might possess
826 	 */
827 	if (skb->sk)
828 		skb_set_owner_w(frag, skb->sk);
829 
830 	/*
831 	 *	Copy the packet header into the new buffer.
832 	 */
833 	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
834 
835 	fragnexthdr_offset = skb_network_header(frag);
836 	fragnexthdr_offset += prevhdr - skb_network_header(skb);
837 	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
838 
839 	/*
840 	 *	Build fragment header.
841 	 */
842 	fh->nexthdr = state->nexthdr;
843 	fh->reserved = 0;
844 	fh->identification = state->frag_id;
845 
846 	/*
847 	 *	Copy a block of the IP datagram.
848 	 */
849 	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
850 			     len));
851 	state->left -= len;
852 
853 	fh->frag_off = htons(state->offset);
854 	if (state->left > 0)
855 		fh->frag_off |= htons(IP6_MF);
856 	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
857 
858 	state->ptr += len;
859 	state->offset += len;
860 
861 	return frag;
862 }
863 EXPORT_SYMBOL(ip6_frag_next);
864 
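/* Fragment an IPv6 packet for local output: use the fast path that reuses
 * an existing frag_list when its geometry already matches the MTU,
 * otherwise fall back to the slow path that allocates and copies each
 * fragment via ip6_frag_next().
 */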
865 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
866 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
867 {
868 	struct sk_buff *frag;
869 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
870 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
871 				inet6_sk(skb->sk) : NULL;
872 	u8 tstamp_type = skb->tstamp_type;
873 	struct ip6_frag_state state;
874 	unsigned int mtu, hlen, nexthdr_offset;
875 	ktime_t tstamp = skb->tstamp;
876 	int hroom, err = 0;
877 	__be32 frag_id;
878 	u8 *prevhdr, nexthdr = 0;
879 
880 	err = ip6_find_1stfragopt(skb, &prevhdr);
881 	if (err < 0)
882 		goto fail;
883 	hlen = err;
884 	nexthdr = *prevhdr;
885 	nexthdr_offset = prevhdr - skb_network_header(skb);
886 
887 	mtu = ip6_skb_dst_mtu(skb);
888 
889 	/* We must not fragment if the socket is set to force MTU discovery
890 	 * or if the skb is not generated by a local socket.
891 	 */
892 	if (unlikely(!skb->ignore_df && skb->len > mtu))
893 		goto fail_toobig;
894 
895 	if (IP6CB(skb)->frag_max_size) {
896 		if (IP6CB(skb)->frag_max_size > mtu)
897 			goto fail_toobig;
898 
899 		/* don't send fragments larger than what we received */
900 		mtu = IP6CB(skb)->frag_max_size;
901 		if (mtu < IPV6_MIN_MTU)
902 			mtu = IPV6_MIN_MTU;
903 	}
904 
905 	if (np) {
906 		u32 frag_size = READ_ONCE(np->frag_size);
907 
908 		if (frag_size && frag_size < mtu)
909 			mtu = frag_size;
910 	}
911 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
912 		goto fail_toobig;
913 	mtu -= hlen + sizeof(struct frag_hdr);
914 
915 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
916 				    &ipv6_hdr(skb)->saddr);
917 
918 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
919 	    (err = skb_checksum_help(skb)))
920 		goto fail;
921 
922 	prevhdr = skb_network_header(skb) + nexthdr_offset;
923 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
924 	if (skb_has_frag_list(skb)) {
925 		unsigned int first_len = skb_pagelen(skb);
926 		struct ip6_fraglist_iter iter;
927 		struct sk_buff *frag2;
928 
929 		if (first_len - hlen > mtu ||
930 		    ((first_len - hlen) & 7) ||
931 		    skb_cloned(skb) ||
932 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
933 			goto slow_path;
934 
935 		skb_walk_frags(skb, frag) {
936 			/* Correct geometry. */
937 			if (frag->len > mtu ||
938 			    ((frag->len & 7) && frag->next) ||
939 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
940 				goto slow_path_clean;
941 
942 			/* Partially cloned skb? */
943 			if (skb_shared(frag))
944 				goto slow_path_clean;
945 
946 			BUG_ON(frag->sk);
947 			if (skb->sk) {
948 				frag->sk = skb->sk;
949 				frag->destructor = sock_wfree;
950 			}
951 			skb->truesize -= frag->truesize;
952 		}
953 
954 		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
955 					&iter);
956 		if (err < 0)
957 			goto fail;
958 
959 		/* We prevent @rt from being freed. */
960 		rcu_read_lock();
961 
962 		for (;;) {
963 			/* Prepare the header of the next frame
964 			 * before the previous one is sent. */
965 			if (iter.frag)
966 				ip6_fraglist_prepare(skb, &iter);
967 
968 			skb_set_delivery_time(skb, tstamp, tstamp_type);
969 			err = output(net, sk, skb);
970 			if (!err)
971 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
972 					      IPSTATS_MIB_FRAGCREATES);
973 
974 			if (err || !iter.frag)
975 				break;
976 
977 			skb = ip6_fraglist_next(&iter);
978 		}
979 
980 		kfree(iter.tmp_hdr);
981 
982 		if (err == 0) {
983 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
984 				      IPSTATS_MIB_FRAGOKS);
985 			rcu_read_unlock();
986 			return 0;
987 		}
988 
989 		kfree_skb_list(iter.frag);
990 
991 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
992 			      IPSTATS_MIB_FRAGFAILS);
993 		rcu_read_unlock();
994 		return err;
995 
996 slow_path_clean:
997 		skb_walk_frags(skb, frag2) {
998 			if (frag2 == frag)
999 				break;
1000 			frag2->sk = NULL;
1001 			frag2->destructor = NULL;
1002 			skb->truesize += frag2->truesize;
1003 		}
1004 	}
1005 
1006 slow_path:
1007 	/*
1008 	 *	Fragment the datagram.
1009 	 */
1010 
1011 	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1012 		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1013 		      &state);
1014 
1015 	/*
1016 	 *	Keep copying data until we run out.
1017 	 */
1018 
1019 	while (state.left > 0) {
1020 		frag = ip6_frag_next(skb, &state);
1021 		if (IS_ERR(frag)) {
1022 			err = PTR_ERR(frag);
1023 			goto fail;
1024 		}
1025 
1026 		/*
1027 		 *	Put this fragment into the sending queue.
1028 		 */
1029 		skb_set_delivery_time(frag, tstamp, tstamp_type);
1030 		err = output(net, sk, frag);
1031 		if (err)
1032 			goto fail;
1033 
1034 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1035 			      IPSTATS_MIB_FRAGCREATES);
1036 	}
1037 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1038 		      IPSTATS_MIB_FRAGOKS);
1039 	consume_skb(skb);
1040 	return err;
1041 
1042 fail_toobig:
1043 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1044 	err = -EMSGSIZE;
1045 
1046 fail:
1047 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1048 		      IPSTATS_MIB_FRAGFAILS);
1049 	kfree_skb(skb);
1050 	return err;
1051 }
1052 
1053 static inline int ip6_rt_check(const struct rt6key *rt_key,
1054 			       const struct in6_addr *fl_addr,
1055 			       const struct in6_addr *addr_cache)
1056 {
1057 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1058 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1059 }
1060 
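/* Validate a socket-cached dst against the flow: release it and return
 * NULL if it is not an IPv6 route or if it no longer matches the
 * destination, source (with subtrees) or outgoing interface.
 */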
1061 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1062 					  struct dst_entry *dst,
1063 					  const struct flowi6 *fl6)
1064 {
1065 	struct ipv6_pinfo *np = inet6_sk(sk);
1066 	struct rt6_info *rt;
1067 
1068 	if (!dst)
1069 		goto out;
1070 
1071 	if (dst->ops->family != AF_INET6) {
1072 		dst_release(dst);
1073 		return NULL;
1074 	}
1075 
1076 	rt = dst_rt6_info(dst);
1077 	/* Yes, checking route validity in the not-connected
1078 	 * case is not very simple. Take into account
1079 	 * that we do not support routing by source, TOS,
1080 	 * and MSG_DONTROUTE		--ANK (980726)
1081 	 *
1082 	 * 1. ip6_rt_check(): If route was host route,
1083 	 *    check that cached destination is current.
1084 	 *    If it is network route, we still may
1085 	 *    check its validity using saved pointer
1086 	 *    to the last used address: daddr_cache.
1087 	 *    We do not want to save the whole address now
1088 	 *    (because the main consumer of this service
1089 	 *    is TCP, which does not have this problem),
1090 	 *    so that the last trick works only on connected
1091 	 *    sockets.
1092 	 * 2. oif also should be the same.
1093 	 */
1094 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1095 #ifdef CONFIG_IPV6_SUBTREES
1096 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1097 #endif
1098 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1099 		dst_release(dst);
1100 		dst = NULL;
1101 	}
1102 
1103 out:
1104 	return dst;
1105 }
1106 
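/* Core of the output route lookup: pick a source address when the flow
 * does not have one yet, retry the lookup for source-specific routes, and
 * with optimistic DAD fall back to the default router's dst when the
 * nexthop neighbour is not yet valid.
 */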
1107 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1108 			       struct dst_entry **dst, struct flowi6 *fl6)
1109 {
1110 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1111 	struct neighbour *n;
1112 	struct rt6_info *rt;
1113 #endif
1114 	int err;
1115 	int flags = 0;
1116 
1117 	/* The correct way to handle this would be to do
1118 	 * ip6_route_get_saddr, and then ip6_route_output; however,
1119 	 * the route-specific preferred source forces the
1120 	 * ip6_route_output call _before_ ip6_route_get_saddr.
1121 	 *
1122 	 * In source specific routing (no src=any default route),
1123 	 * ip6_route_output will fail given src=any saddr, though, so
1124 	 * that's why we try it again later.
1125 	 */
1126 	if (ipv6_addr_any(&fl6->saddr)) {
1127 		struct fib6_info *from;
1128 		struct rt6_info *rt;
1129 
1130 		*dst = ip6_route_output(net, sk, fl6);
1131 		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1132 
1133 		rcu_read_lock();
1134 		from = rt ? rcu_dereference(rt->from) : NULL;
1135 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1136 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1137 					  fl6->flowi6_l3mdev,
1138 					  &fl6->saddr);
1139 		rcu_read_unlock();
1140 
1141 		if (err)
1142 			goto out_err_release;
1143 
1144 		/* If we had an erroneous initial result, pretend it
1145 		 * never existed and let the SA-enabled version take
1146 		 * over.
1147 		 */
1148 		if ((*dst)->error) {
1149 			dst_release(*dst);
1150 			*dst = NULL;
1151 		}
1152 
1153 		if (fl6->flowi6_oif)
1154 			flags |= RT6_LOOKUP_F_IFACE;
1155 	}
1156 
1157 	if (!*dst)
1158 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1159 
1160 	err = (*dst)->error;
1161 	if (err)
1162 		goto out_err_release;
1163 
1164 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1165 	/*
1166 	 * Here if the dst entry we've looked up
1167 	 * has a neighbour entry that is in the INCOMPLETE
1168 	 * state and the src address from the flow is
1169 	 * marked as OPTIMISTIC, we release the found
1170 	 * dst entry and replace it instead with the
1171 	 * dst entry of the nexthop router
1172 	 */
1173 	rt = dst_rt6_info(*dst);
1174 	rcu_read_lock();
1175 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1176 				      rt6_nexthop(rt, &fl6->daddr));
1177 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1178 	rcu_read_unlock();
1179 
1180 	if (err) {
1181 		struct inet6_ifaddr *ifp;
1182 		struct flowi6 fl_gw6;
1183 		int redirect;
1184 
1185 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1186 				      (*dst)->dev, 1);
1187 
1188 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1189 		if (ifp)
1190 			in6_ifa_put(ifp);
1191 
1192 		if (redirect) {
1193 			/*
1194 			 * We need to get the dst entry for the
1195 			 * default router instead
1196 			 */
1197 			dst_release(*dst);
1198 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1199 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1200 			*dst = ip6_route_output(net, sk, &fl_gw6);
1201 			err = (*dst)->error;
1202 			if (err)
1203 				goto out_err_release;
1204 		}
1205 	}
1206 #endif
1207 	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1208 	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1209 		err = -EAFNOSUPPORT;
1210 		goto out_err_release;
1211 	}
1212 
1213 	return 0;
1214 
1215 out_err_release:
1216 	dst_release(*dst);
1217 	*dst = NULL;
1218 
1219 	if (err == -ENETUNREACH)
1220 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1221 	return err;
1222 }
1223 
1224 /**
1225  *	ip6_dst_lookup - perform route lookup on flow
1226  *	@net: Network namespace to perform lookup in
1227  *	@sk: socket which provides route info
1228  *	@dst: pointer to dst_entry * for result
1229  *	@fl6: flow to lookup
1230  *
1231  *	This function performs a route lookup on the given flow.
1232  *
1233  *	It returns zero on success, or a standard errno code on error.
1234  */
1235 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1236 		   struct flowi6 *fl6)
1237 {
1238 	*dst = NULL;
1239 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1240 }
1241 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1242 
1243 /**
1244  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1245  *	@net: Network namespace to perform lookup in
1246  *	@sk: socket which provides route info
1247  *	@fl6: flow to lookup
1248  *	@final_dst: final destination address for ipsec lookup
1249  *
1250  *	This function performs a route lookup on the given flow.
1251  *
1252  *	It returns a valid dst pointer on success, or a pointer encoded
1253  *	error code.
1254  */
1255 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1256 				      const struct in6_addr *final_dst)
1257 {
1258 	struct dst_entry *dst = NULL;
1259 	int err;
1260 
1261 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1262 	if (err)
1263 		return ERR_PTR(err);
1264 	if (final_dst)
1265 		fl6->daddr = *final_dst;
1266 
1267 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1268 }
1269 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1270 
1271 /**
1272  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1273  *	@sk: socket which provides the dst cache and route info
1274  *	@fl6: flow to lookup
1275  *	@final_dst: final destination address for ipsec lookup
1276  *	@connected: whether @sk is connected or not
1277  *
1278  *	This function performs a route lookup on the given flow with the
1279  *	possibility of using the cached route in the socket if it is valid.
1280  *	It will take the socket dst lock when operating on the dst cache.
1281  *	As a result, this function can only be used in process context.
1282  *
1283  *	In addition, for a connected socket, cache the dst in the socket
1284  *	if the current cache is not valid.
1285  *
1286  *	It returns a valid dst pointer on success, or a pointer encoded
1287  *	error code.
1288  */
1289 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1290 					 const struct in6_addr *final_dst,
1291 					 bool connected)
1292 {
1293 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1294 
1295 	dst = ip6_sk_dst_check(sk, dst, fl6);
1296 	if (dst)
1297 		return dst;
1298 
1299 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1300 	if (connected && !IS_ERR(dst))
1301 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1302 
1303 	return dst;
1304 }
1305 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1306 
1307 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1308 					       gfp_t gfp)
1309 {
1310 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1311 }
1312 
1313 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1314 						gfp_t gfp)
1315 {
1316 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1317 }
1318 
1319 static void ip6_append_data_mtu(unsigned int *mtu,
1320 				int *maxfraglen,
1321 				unsigned int fragheaderlen,
1322 				struct sk_buff *skb,
1323 				struct rt6_info *rt,
1324 				unsigned int orig_mtu)
1325 {
1326 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1327 		if (!skb) {
1328 			/* first fragment, reserve header_len */
1329 			*mtu = orig_mtu - rt->dst.header_len;
1330 
1331 		} else {
1332 			/*
1333 			 * this fragment is not first, the headers
1334 			 * space is regarded as data space.
1335 			 */
1336 			*mtu = orig_mtu;
1337 		}
1338 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1339 			      + fragheaderlen - sizeof(struct frag_hdr);
1340 	}
1341 }
1342 
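/* Initialise the cork for a corked send: take over the dst reference,
 * duplicate the tx options, record hop limit, traffic class and dontfrag,
 * and derive the MTU honouring IPV6_PMTUDISC_PROBE and the per-socket
 * frag_size.
 */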
1343 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1344 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1345 			  struct rt6_info *rt)
1346 {
1347 	struct ipv6_pinfo *np = inet6_sk(sk);
1348 	unsigned int mtu, frag_size;
1349 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1350 
1351 	/* callers pass dst together with a reference, set it first so
1352 	 * ip6_cork_release() can put it down even in case of an error.
1353 	 */
1354 	cork->base.dst = &rt->dst;
1355 
1356 	/*
1357 	 * setup for corking
1358 	 */
1359 	if (opt) {
1360 		if (WARN_ON(v6_cork->opt))
1361 			return -EINVAL;
1362 
1363 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1364 		if (unlikely(!nopt))
1365 			return -ENOBUFS;
1366 
1367 		nopt->tot_len = sizeof(*opt);
1368 		nopt->opt_flen = opt->opt_flen;
1369 		nopt->opt_nflen = opt->opt_nflen;
1370 
1371 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1372 		if (opt->dst0opt && !nopt->dst0opt)
1373 			return -ENOBUFS;
1374 
1375 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1376 		if (opt->dst1opt && !nopt->dst1opt)
1377 			return -ENOBUFS;
1378 
1379 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1380 		if (opt->hopopt && !nopt->hopopt)
1381 			return -ENOBUFS;
1382 
1383 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1384 		if (opt->srcrt && !nopt->srcrt)
1385 			return -ENOBUFS;
1386 
1387 		/* need source address above miyazawa*/
1388 	}
1389 	v6_cork->hop_limit = ipc6->hlimit;
1390 	v6_cork->tclass = ipc6->tclass;
1391 	v6_cork->dontfrag = ipc6->dontfrag;
1392 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1393 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1394 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1395 	else
1396 		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1397 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1398 
1399 	frag_size = READ_ONCE(np->frag_size);
1400 	if (frag_size && frag_size < mtu)
1401 		mtu = frag_size;
1402 
1403 	cork->base.fragsize = mtu;
1404 	cork->base.gso_size = ipc6->gso_size;
1405 	cork->base.tx_flags = 0;
1406 	cork->base.mark = ipc6->sockc.mark;
1407 	cork->base.priority = ipc6->sockc.priority;
1408 	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1409 	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1410 		cork->base.flags |= IPCORK_TS_OPT_ID;
1411 		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1412 	}
1413 	cork->base.length = 0;
1414 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1415 
1416 	return 0;
1417 }
1418 
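/* Append user data to the queue of pending skbs, splitting it at
 * maxfraglen boundaries so the result can later become one packet or a
 * chain of fragments; handles zerocopy, MSG_SPLICE_PAGES and page-frag
 * coalescing.
 */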
1419 static int __ip6_append_data(struct sock *sk,
1420 			     struct sk_buff_head *queue,
1421 			     struct inet_cork_full *cork_full,
1422 			     struct inet6_cork *v6_cork,
1423 			     struct page_frag *pfrag,
1424 			     int getfrag(void *from, char *to, int offset,
1425 					 int len, int odd, struct sk_buff *skb),
1426 			     void *from, size_t length, int transhdrlen,
1427 			     unsigned int flags)
1428 {
1429 	struct sk_buff *skb, *skb_prev = NULL;
1430 	struct inet_cork *cork = &cork_full->base;
1431 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1432 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1433 	struct ubuf_info *uarg = NULL;
1434 	int exthdrlen = 0;
1435 	int dst_exthdrlen = 0;
1436 	int hh_len;
1437 	int copy;
1438 	int err;
1439 	int offset = 0;
1440 	bool zc = false;
1441 	u32 tskey = 0;
1442 	struct rt6_info *rt = dst_rt6_info(cork->dst);
1443 	bool paged, hold_tskey = false, extra_uref = false;
1444 	struct ipv6_txoptions *opt = v6_cork->opt;
1445 	int csummode = CHECKSUM_NONE;
1446 	unsigned int maxnonfragsize, headersize;
1447 	unsigned int wmem_alloc_delta = 0;
1448 
1449 	skb = skb_peek_tail(queue);
1450 	if (!skb) {
1451 		exthdrlen = opt ? opt->opt_flen : 0;
1452 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1453 	}
1454 
1455 	paged = !!cork->gso_size;
1456 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1457 	orig_mtu = mtu;
1458 
1459 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1460 
1461 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1462 			(opt ? opt->opt_nflen : 0);
1463 
1464 	headersize = sizeof(struct ipv6hdr) +
1465 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1466 		     rt->rt6i_nfheader_len;
1467 
1468 	if (mtu <= fragheaderlen ||
1469 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1470 		goto emsgsize;
1471 
1472 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1473 		     sizeof(struct frag_hdr);
1474 
1475 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1476 	 * the first fragment
1477 	 */
1478 	if (headersize + transhdrlen > mtu)
1479 		goto emsgsize;
1480 
1481 	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
1482 	    (sk->sk_protocol == IPPROTO_UDP ||
1483 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1484 	     sk->sk_protocol == IPPROTO_RAW)) {
1485 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1486 				sizeof(struct ipv6hdr));
1487 		goto emsgsize;
1488 	}
1489 
1490 	if (ip6_sk_ignore_df(sk))
1491 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1492 	else
1493 		maxnonfragsize = mtu;
1494 
1495 	if (cork->length + length > maxnonfragsize - headersize) {
1496 emsgsize:
1497 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1498 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1499 		return -EMSGSIZE;
1500 	}
1501 
1502 	/* CHECKSUM_PARTIAL only with no extension headers and when
1503 	 * we are not going to fragment
1504 	 */
1505 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1506 	    headersize == sizeof(struct ipv6hdr) &&
1507 	    length <= mtu - headersize &&
1508 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1509 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1510 		csummode = CHECKSUM_PARTIAL;
1511 
1512 	if ((flags & MSG_ZEROCOPY) && length) {
1513 		struct msghdr *msg = from;
1514 
1515 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1516 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1517 				return -EINVAL;
1518 
1519 			/* Leave uarg NULL if can't zerocopy, callers should
1520 			 * be able to handle it.
1521 			 */
1522 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1523 			    csummode == CHECKSUM_PARTIAL) {
1524 				paged = true;
1525 				zc = true;
1526 				uarg = msg->msg_ubuf;
1527 			}
1528 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1529 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1530 						    false);
1531 			if (!uarg)
1532 				return -ENOBUFS;
1533 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1534 			if (rt->dst.dev->features & NETIF_F_SG &&
1535 			    csummode == CHECKSUM_PARTIAL) {
1536 				paged = true;
1537 				zc = true;
1538 			} else {
1539 				uarg_to_msgzc(uarg)->zerocopy = 0;
1540 				skb_zcopy_set(skb, uarg, &extra_uref);
1541 			}
1542 		}
1543 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1544 		if (inet_test_bit(HDRINCL, sk))
1545 			return -EPERM;
1546 		if (rt->dst.dev->features & NETIF_F_SG &&
1547 		    getfrag == ip_generic_getfrag)
1548 			/* We need an empty buffer to attach stuff to */
1549 			paged = true;
1550 		else
1551 			flags &= ~MSG_SPLICE_PAGES;
1552 	}
1553 
1554 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1555 	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1556 		if (cork->flags & IPCORK_TS_OPT_ID) {
1557 			tskey = cork->ts_opt_id;
1558 		} else {
1559 			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1560 			hold_tskey = true;
1561 		}
1562 	}
1563 
1564 	/*
1565 	 * Let's try using as much space as possible.
1566 	 * Use MTU if total length of the message fits into the MTU.
1567 	 * Otherwise, we need to reserve fragment header and
1568 	 * fragment alignment (= 8-15 octets, in total).
1569 	 *
1570 	 * Note that we may need to "move" the data from the tail
1571 	 * of the buffer to the new fragment when we split
1572 	 * the message.
1573 	 *
1574 	 * FIXME: It may be fragmented into multiple chunks
1575 	 *        at once if non-fragmentable extension headers
1576 	 *        are too large.
1577 	 * --yoshfuji
1578 	 */
1579 
1580 	cork->length += length;
1581 	if (!skb)
1582 		goto alloc_new_skb;
1583 
1584 	while (length > 0) {
1585 		/* Check if the remaining data fits into current packet. */
1586 		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1587 		if (copy < length)
1588 			copy = maxfraglen - skb->len;
1589 
1590 		if (copy <= 0) {
1591 			char *data;
1592 			unsigned int datalen;
1593 			unsigned int fraglen;
1594 			unsigned int fraggap;
1595 			unsigned int alloclen, alloc_extra;
1596 			unsigned int pagedlen;
1597 alloc_new_skb:
1598 			/* There's no room in the current skb */
1599 			if (skb)
1600 				fraggap = skb->len - maxfraglen;
1601 			else
1602 				fraggap = 0;
1603 			/* update mtu and maxfraglen if necessary */
1604 			if (!skb || !skb_prev)
1605 				ip6_append_data_mtu(&mtu, &maxfraglen,
1606 						    fragheaderlen, skb, rt,
1607 						    orig_mtu);
1608 
1609 			skb_prev = skb;
1610 
1611 			/*
1612 			 * If remaining data exceeds the mtu,
1613 			 * we know we need more fragment(s).
1614 			 */
1615 			datalen = length + fraggap;
1616 
1617 			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1618 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1619 			fraglen = datalen + fragheaderlen;
1620 			pagedlen = 0;
1621 
1622 			alloc_extra = hh_len;
1623 			alloc_extra += dst_exthdrlen;
1624 			alloc_extra += rt->dst.trailer_len;
1625 
1626 			/* We just reserve space for fragment header.
1627 			 * Note: this may be an overallocation if the message
1628 			 * (without MSG_MORE) fits into the MTU.
1629 			 */
1630 			alloc_extra += sizeof(struct frag_hdr);
1631 
1632 			if ((flags & MSG_MORE) &&
1633 			    !(rt->dst.dev->features&NETIF_F_SG))
1634 				alloclen = mtu;
1635 			else if (!paged &&
1636 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1637 				  !(rt->dst.dev->features & NETIF_F_SG)))
1638 				alloclen = fraglen;
1639 			else {
1640 				alloclen = fragheaderlen + transhdrlen;
1641 				pagedlen = datalen - transhdrlen;
1642 			}
1643 			alloclen += alloc_extra;
1644 
1645 			if (datalen != length + fraggap) {
1646 				/*
1647 				 * this is not the last fragment, the trailer
1648 				 * space is regarded as data space.
1649 				 */
1650 				datalen += rt->dst.trailer_len;
1651 			}
1652 
1653 			fraglen = datalen + fragheaderlen;
1654 
1655 			copy = datalen - transhdrlen - fraggap - pagedlen;
1656 			/* [!] NOTE: copy may be negative if pagedlen>0
1657 			 * because then the equation may reduce to -fraggap.
1658 			 */
1659 			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1660 				err = -EINVAL;
1661 				goto error;
1662 			}
1663 			if (transhdrlen) {
1664 				skb = sock_alloc_send_skb(sk, alloclen,
1665 						(flags & MSG_DONTWAIT), &err);
1666 			} else {
1667 				skb = NULL;
1668 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1669 				    2 * sk->sk_sndbuf)
1670 					skb = alloc_skb(alloclen,
1671 							sk->sk_allocation);
1672 				if (unlikely(!skb))
1673 					err = -ENOBUFS;
1674 			}
1675 			if (!skb)
1676 				goto error;
1677 			/*
1678 			 *	Fill in the control structures
1679 			 */
1680 			skb->protocol = htons(ETH_P_IPV6);
1681 			skb->ip_summed = csummode;
1682 			skb->csum = 0;
1683 			/* reserve for fragmentation and ipsec header */
1684 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1685 				    dst_exthdrlen);
1686 
1687 			/*
1688 			 *	Find where to start putting bytes
1689 			 */
1690 			data = skb_put(skb, fraglen - pagedlen);
1691 			skb_set_network_header(skb, exthdrlen);
1692 			data += fragheaderlen;
1693 			skb->transport_header = (skb->network_header +
1694 						 fragheaderlen);
1695 			if (fraggap) {
1696 				skb->csum = skb_copy_and_csum_bits(
1697 					skb_prev, maxfraglen,
1698 					data + transhdrlen, fraggap);
1699 				skb_prev->csum = csum_sub(skb_prev->csum,
1700 							  skb->csum);
1701 				data += fraggap;
1702 				pskb_trim_unique(skb_prev, maxfraglen);
1703 			}
1704 			if (copy > 0 &&
1705 			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1706 					   from, data + transhdrlen, offset,
1707 					   copy, fraggap, skb) < 0) {
1708 				err = -EFAULT;
1709 				kfree_skb(skb);
1710 				goto error;
1711 			} else if (flags & MSG_SPLICE_PAGES) {
1712 				copy = 0;
1713 			}
1714 
1715 			offset += copy;
1716 			length -= copy + transhdrlen;
1717 			transhdrlen = 0;
1718 			exthdrlen = 0;
1719 			dst_exthdrlen = 0;
1720 
1721 			/* Only the initial fragment is time stamped */
1722 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1723 			cork->tx_flags = 0;
1724 			skb_shinfo(skb)->tskey = tskey;
1725 			tskey = 0;
1726 			skb_zcopy_set(skb, uarg, &extra_uref);
1727 
1728 			if ((flags & MSG_CONFIRM) && !skb_prev)
1729 				skb_set_dst_pending_confirm(skb, 1);
1730 
1731 			/*
1732 			 * Put the packet on the pending queue
1733 			 */
1734 			if (!skb->destructor) {
1735 				skb->destructor = sock_wfree;
1736 				skb->sk = sk;
1737 				wmem_alloc_delta += skb->truesize;
1738 			}
1739 			__skb_queue_tail(queue, skb);
1740 			continue;
1741 		}
1742 
1743 		if (copy > length)
1744 			copy = length;
1745 
1746 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1747 		    skb_tailroom(skb) >= copy) {
1748 			unsigned int off;
1749 
1750 			off = skb->len;
1751 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1752 					    from, skb_put(skb, copy),
1753 					    offset, copy, off, skb) < 0) {
1754 				__skb_trim(skb, off);
1755 				err = -EFAULT;
1756 				goto error;
1757 			}
1758 		} else if (flags & MSG_SPLICE_PAGES) {
1759 			struct msghdr *msg = from;
1760 
1761 			err = -EIO;
1762 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1763 				goto error;
1764 
1765 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1766 						   sk->sk_allocation);
1767 			if (err < 0)
1768 				goto error;
1769 			copy = err;
1770 			wmem_alloc_delta += copy;
1771 		} else if (!zc) {
1772 			int i = skb_shinfo(skb)->nr_frags;
1773 
1774 			err = -ENOMEM;
1775 			if (!sk_page_frag_refill(sk, pfrag))
1776 				goto error;
1777 
1778 			skb_zcopy_downgrade_managed(skb);
1779 			if (!skb_can_coalesce(skb, i, pfrag->page,
1780 					      pfrag->offset)) {
1781 				err = -EMSGSIZE;
1782 				if (i == MAX_SKB_FRAGS)
1783 					goto error;
1784 
1785 				__skb_fill_page_desc(skb, i, pfrag->page,
1786 						     pfrag->offset, 0);
1787 				skb_shinfo(skb)->nr_frags = ++i;
1788 				get_page(pfrag->page);
1789 			}
1790 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1791 			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1792 				    from,
1793 				    page_address(pfrag->page) + pfrag->offset,
1794 				    offset, copy, skb->len, skb) < 0)
1795 				goto error_efault;
1796 
1797 			pfrag->offset += copy;
1798 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1799 			skb->len += copy;
1800 			skb->data_len += copy;
1801 			skb->truesize += copy;
1802 			wmem_alloc_delta += copy;
1803 		} else {
1804 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1805 			if (err < 0)
1806 				goto error;
1807 		}
1808 		offset += copy;
1809 		length -= copy;
1810 	}
1811 
1812 	if (wmem_alloc_delta)
1813 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1814 	return 0;
1815 
1816 error_efault:
1817 	err = -EFAULT;
1818 error:
1819 	net_zcopy_put_abort(uarg, extra_uref);
1820 	cork->length -= length;
1821 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1822 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1823 	if (hold_tskey)
1824 		atomic_dec(&sk->sk_tskey);
1825 	return err;
1826 }
1827 
1828 int ip6_append_data(struct sock *sk,
1829 		    int getfrag(void *from, char *to, int offset, int len,
1830 				int odd, struct sk_buff *skb),
1831 		    void *from, size_t length, int transhdrlen,
1832 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1833 		    struct rt6_info *rt, unsigned int flags)
1834 {
1835 	struct inet_sock *inet = inet_sk(sk);
1836 	struct ipv6_pinfo *np = inet6_sk(sk);
1837 	int exthdrlen;
1838 	int err;
1839 
1840 	if (flags&MSG_PROBE)
1841 		return 0;
1842 	if (skb_queue_empty(&sk->sk_write_queue)) {
1843 		/*
1844 		 * setup for corking
1845 		 */
1846 		dst_hold(&rt->dst);
1847 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1848 				     ipc6, rt);
1849 		if (err)
1850 			return err;
1851 
1852 		inet->cork.fl.u.ip6 = *fl6;
1853 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1854 		length += exthdrlen;
1855 		transhdrlen += exthdrlen;
1856 	} else {
1857 		transhdrlen = 0;
1858 	}
1859 
1860 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1861 				 &np->cork, sk_page_frag(sk), getfrag,
1862 				 from, length, transhdrlen, flags);
1863 }
1864 EXPORT_SYMBOL_GPL(ip6_append_data);
1865 
1866 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1867 {
1868 	struct dst_entry *dst = cork->base.dst;
1869 
1870 	cork->base.dst = NULL;
1871 	skb_dst_set(skb, dst);
1872 }
1873 
1874 static void ip6_cork_release(struct inet_cork_full *cork,
1875 			     struct inet6_cork *v6_cork)
1876 {
1877 	if (v6_cork->opt) {
1878 		struct ipv6_txoptions *opt = v6_cork->opt;
1879 
1880 		kfree(opt->dst0opt);
1881 		kfree(opt->dst1opt);
1882 		kfree(opt->hopopt);
1883 		kfree(opt->srcrt);
1884 		kfree(opt);
1885 		v6_cork->opt = NULL;
1886 	}
1887 
1888 	if (cork->base.dst) {
1889 		dst_release(cork->base.dst);
1890 		cork->base.dst = NULL;
1891 	}
1892 }
1893 
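/* Collapse the queued skbs into a single packet: chain the tail skbs on
 * frag_list, push the extension headers and the IPv6 header, steal the
 * corked dst and release the cork.
 */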
1894 struct sk_buff *__ip6_make_skb(struct sock *sk,
1895 			       struct sk_buff_head *queue,
1896 			       struct inet_cork_full *cork,
1897 			       struct inet6_cork *v6_cork)
1898 {
1899 	struct sk_buff *skb, *tmp_skb;
1900 	struct sk_buff **tail_skb;
1901 	struct in6_addr *final_dst;
1902 	struct net *net = sock_net(sk);
1903 	struct ipv6hdr *hdr;
1904 	struct ipv6_txoptions *opt = v6_cork->opt;
1905 	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1906 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1907 	unsigned char proto = fl6->flowi6_proto;
1908 
1909 	skb = __skb_dequeue(queue);
1910 	if (!skb)
1911 		goto out;
1912 	tail_skb = &(skb_shinfo(skb)->frag_list);
1913 
1914 	/* move skb->data to ip header from ext header */
1915 	if (skb->data < skb_network_header(skb))
1916 		__skb_pull(skb, skb_network_offset(skb));
1917 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1918 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1919 		*tail_skb = tmp_skb;
1920 		tail_skb = &(tmp_skb->next);
1921 		skb->len += tmp_skb->len;
1922 		skb->data_len += tmp_skb->len;
1923 		skb->truesize += tmp_skb->truesize;
1924 		tmp_skb->destructor = NULL;
1925 		tmp_skb->sk = NULL;
1926 	}
1927 
1928 	/* Allow local fragmentation. */
1929 	skb->ignore_df = ip6_sk_ignore_df(sk);
1930 	__skb_pull(skb, skb_network_header_len(skb));
1931 
1932 	final_dst = &fl6->daddr;
1933 	if (opt && opt->opt_flen)
1934 		ipv6_push_frag_opts(skb, opt, &proto);
1935 	if (opt && opt->opt_nflen)
1936 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1937 
1938 	skb_push(skb, sizeof(struct ipv6hdr));
1939 	skb_reset_network_header(skb);
1940 	hdr = ipv6_hdr(skb);
1941 
1942 	ip6_flow_hdr(hdr, v6_cork->tclass,
1943 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1944 					ip6_autoflowlabel(net, sk), fl6));
1945 	hdr->hop_limit = v6_cork->hop_limit;
1946 	hdr->nexthdr = proto;
1947 	hdr->saddr = fl6->saddr;
1948 	hdr->daddr = *final_dst;
1949 
1950 	skb->priority = cork->base.priority;
1951 	skb->mark = cork->base.mark;
1952 	if (sk_is_tcp(sk))
1953 		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1954 	else
1955 		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1956 
1957 	ip6_cork_steal_dst(skb, cork);
1958 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1959 	if (proto == IPPROTO_ICMPV6) {
1960 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1961 		u8 icmp6_type;
1962 
1963 		if (sk->sk_socket->type == SOCK_RAW &&
1964 		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1965 			icmp6_type = fl6->fl6_icmp_type;
1966 		else
1967 			icmp6_type = icmp6_hdr(skb)->icmp6_type;
1968 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1969 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1970 	}
1971 
1972 	ip6_cork_release(cork, v6_cork);
1973 out:
1974 	return skb;
1975 }
1976 
1977 int ip6_send_skb(struct sk_buff *skb)
1978 {
1979 	struct net *net = sock_net(skb->sk);
1980 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1981 	int err;
1982 
1983 	rcu_read_lock();
1984 	err = ip6_local_out(net, skb->sk, skb);
1985 	if (err) {
1986 		if (err > 0)
1987 			err = net_xmit_errno(err);
1988 		if (err)
1989 			IP6_INC_STATS(net, rt->rt6i_idev,
1990 				      IPSTATS_MIB_OUTDISCARDS);
1991 	}
1992 
1993 	rcu_read_unlock();
1994 	return err;
1995 }
1996 
1997 int ip6_push_pending_frames(struct sock *sk)
1998 {
1999 	struct sk_buff *skb;
2000 
2001 	skb = ip6_finish_skb(sk);
2002 	if (!skb)
2003 		return 0;
2004 
2005 	return ip6_send_skb(skb);
2006 }
2007 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2008 
2009 static void __ip6_flush_pending_frames(struct sock *sk,
2010 				       struct sk_buff_head *queue,
2011 				       struct inet_cork_full *cork,
2012 				       struct inet6_cork *v6_cork)
2013 {
2014 	struct sk_buff *skb;
2015 
2016 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2017 		if (skb_dst(skb))
2018 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2019 				      IPSTATS_MIB_OUTDISCARDS);
2020 		kfree_skb(skb);
2021 	}
2022 
2023 	ip6_cork_release(cork, v6_cork);
2024 }
2025 
2026 void ip6_flush_pending_frames(struct sock *sk)
2027 {
2028 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2029 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2030 }
2031 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2032 
2033 struct sk_buff *ip6_make_skb(struct sock *sk,
2034 			     int getfrag(void *from, char *to, int offset,
2035 					 int len, int odd, struct sk_buff *skb),
2036 			     void *from, size_t length, int transhdrlen,
2037 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2038 			     unsigned int flags, struct inet_cork_full *cork)
2039 {
2040 	struct inet6_cork v6_cork;
2041 	struct sk_buff_head queue;
2042 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2043 	int err;
2044 
2045 	if (flags & MSG_PROBE) {
2046 		dst_release(&rt->dst);
2047 		return NULL;
2048 	}
2049 
2050 	__skb_queue_head_init(&queue);
2051 
2052 	cork->base.flags = 0;
2053 	cork->base.addr = 0;
2054 	cork->base.opt = NULL;
2055 	v6_cork.opt = NULL;
2056 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2057 	if (err) {
2058 		ip6_cork_release(cork, &v6_cork);
2059 		return ERR_PTR(err);
2060 	}
2061 
2062 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2063 				&current->task_frag, getfrag, from,
2064 				length + exthdrlen, transhdrlen + exthdrlen,
2065 				flags);
2066 	if (err) {
2067 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2068 		return ERR_PTR(err);
2069 	}
2070 
2071 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2072 }
2073