xref: /linux/net/ipv4/tcp_ipv4.c (revision a0b0f6c7d7f29f1ade9ec59699d02e3b153ee8e4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Implementation of the Transmission Control Protocol(TCP).
8  *
9  *		IPv4 specific functions
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  */
18 
19 /*
20  * Changes:
21  *		David S. Miller	:	New socket lookup architecture.
22  *					This code is dedicated to John Dyson.
23  *		David S. Miller :	Change semantics of established hash,
24  *					half is devoted to TIME_WAIT sockets
25  *					and the rest go in the other half.
26  *		Andi Kleen :		Add support for syncookies and fixed
27  *					some bugs: ip options weren't passed to
28  *					the TCP layer, missed a check for an
29  *					ACK bit.
30  *		Andi Kleen :		Implemented fast path mtu discovery.
31  *	     				Fixed many serious bugs in the
32  *					request_sock handling and moved
33  *					most of it into the af independent code.
34  *					Added tail drop and some other bugfixes.
35  *					Added new listen semantics.
36  *		Mike McLagan	:	Routing by source
37  *	Juan Jose Ciarlante:		ip_dynaddr bits
38  *		Andi Kleen:		various fixes.
39  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
40  *					coma.
41  *	Andi Kleen		:	Fix new listen.
42  *	Andi Kleen		:	Fix accept error reporting.
43  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
44  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
45  *					a single port at the same time.
46  */
47 
48 #define pr_fmt(fmt) "TCP: " fmt
49 
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/fips.h>
57 #include <linux/jhash.h>
58 #include <linux/init.h>
59 #include <linux/times.h>
60 #include <linux/slab.h>
61 #include <linux/sched.h>
62 #include <linux/sock_diag.h>
63 
64 #include <net/aligned_data.h>
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/tcp_ecn.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/inet_ecn.h>
74 #include <net/timewait_sock.h>
75 #include <net/xfrm.h>
76 #include <net/secure_seq.h>
77 #include <net/busy_poll.h>
78 #include <net/rstreason.h>
79 #include <net/psp.h>
80 
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86 #include <linux/inetdevice.h>
87 #include <linux/btf_ids.h>
88 #include <linux/skbuff_ref.h>
89 
90 #include <crypto/md5.h>
91 #include <crypto/utils.h>
92 
93 #include <trace/events/tcp.h>
94 
/* Forward declaration of the MD5 header-hash helper; it is called by the
 * reply builders tcp_v4_send_reset() and tcp_v4_send_ack() below.
 * NOTE(review): the definition is presumably further down in this file.
 */
95 #ifdef CONFIG_TCP_MD5SIG
96 static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97 				__be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99 
/* NOTE(review): presumably the TCP socket lookup tables; not referenced by
 * the functions directly following these declarations - confirm against users.
 */
100 struct inet_hashinfo tcp_hashinfo;
101 
/* Per-CPU socket slot together with its BH local lock.  tcp_v4_send_reset()
 * and tcp_v4_send_ack() take .bh_lock with local_lock_nested_bh() and then
 * read .sock to obtain the control socket they hand to
 * ip_send_unicast_reply().
 */
102 static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
103 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
104 };
105 
/* NOTE(review): the functions directly following this declaration do not
 * take the mutex; going by the name it serialises batched netns exit -
 * confirm.
 */
106 static DEFINE_MUTEX(tcp_exit_batch_mutex);
107 
108 INDIRECT_CALLABLE_SCOPE union tcp_seq_and_ts_off
tcp_v4_init_seq_and_ts_off(const struct net * net,const struct sk_buff * skb)109 tcp_v4_init_seq_and_ts_off(const struct net *net, const struct sk_buff *skb)
110 {
111 	return secure_tcp_seq_and_ts_off(net,
112 					 ip_hdr(skb)->daddr,
113 					 ip_hdr(skb)->saddr,
114 					 tcp_hdr(skb)->dest,
115 					 tcp_hdr(skb)->source);
116 }
117 
tcp_twsk_unique(struct sock * sk,struct sock * sktw,void * twp)118 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
119 {
120 	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
121 	const struct inet_timewait_sock *tw = inet_twsk(sktw);
122 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
123 	struct tcp_sock *tp = tcp_sk(sk);
124 	int ts_recent_stamp;
125 	u32 reuse_thresh;
126 
127 	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
128 		reuse = 0;
129 
130 	if (reuse == 2) {
131 		/* Still does not detect *everything* that goes through
132 		 * lo, since we require a loopback src or dst address
133 		 * or direct binding to 'lo' interface.
134 		 */
135 		bool loopback = false;
136 		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
137 			loopback = true;
138 #if IS_ENABLED(CONFIG_IPV6)
139 		if (tw->tw_family == AF_INET6) {
140 			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
141 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
142 			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
143 			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
144 				loopback = true;
145 		} else
146 #endif
147 		{
148 			if (ipv4_is_loopback(tw->tw_daddr) ||
149 			    ipv4_is_loopback(tw->tw_rcv_saddr))
150 				loopback = true;
151 		}
152 		if (!loopback)
153 			reuse = 0;
154 	}
155 
156 	/* With PAWS, it is safe from the viewpoint
157 	   of data integrity. Even without PAWS it is safe provided sequence
158 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
159 
160 	   Actually, the idea is close to VJ's one, only timestamp cache is
161 	   held not per host, but per port pair and TW bucket is used as state
162 	   holder.
163 
164 	   If TW bucket has been already destroyed we fall back to VJ's scheme
165 	   and use initial timestamp retrieved from peer table.
166 	 */
167 	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
168 	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
169 		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
170 	if (ts_recent_stamp &&
171 	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
172 		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
173 		 * and releasing the bucket lock.
174 		 */
175 		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
176 			return 0;
177 
178 		/* In case of repair and re-using TIME-WAIT sockets we still
179 		 * want to be sure that it is safe as above but honor the
180 		 * sequence numbers and time stamps set as part of the repair
181 		 * process.
182 		 *
183 		 * Without this check re-using a TIME-WAIT socket with TCP
184 		 * repair would accumulate a -1 on the repair assigned
185 		 * sequence number. The first time it is reused the sequence
186 		 * is -1, the second time -2, etc. This fixes that issue
187 		 * without appearing to create any others.
188 		 */
189 		if (likely(!tp->repair)) {
190 			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
191 
192 			if (!seq)
193 				seq = 1;
194 			WRITE_ONCE(tp->write_seq, seq);
195 			tp->rx_opt.ts_recent	   = READ_ONCE(tcptw->tw_ts_recent);
196 			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
197 		}
198 
199 		return 1;
200 	}
201 
202 	return 0;
203 }
204 
tcp_v4_pre_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)205 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
206 			      int addr_len)
207 {
208 	/* This check is replicated from tcp_v4_connect() and intended to
209 	 * prevent BPF program called below from accessing bytes that are out
210 	 * of the bound specified by user in addr_len.
211 	 */
212 	if (addr_len < sizeof(struct sockaddr_in))
213 		return -EINVAL;
214 
215 	sock_owned_by_me(sk);
216 
217 	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
218 }
219 
220 /* This will initiate an outgoing connection. */
tcp_v4_connect(struct sock * sk,struct sockaddr_unsized * uaddr,int addr_len)221 int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
222 {
223 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
224 	struct inet_timewait_death_row *tcp_death_row;
225 	struct inet_sock *inet = inet_sk(sk);
226 	struct tcp_sock *tp = tcp_sk(sk);
227 	struct ip_options_rcu *inet_opt;
228 	struct net *net = sock_net(sk);
229 	__be16 orig_sport, orig_dport;
230 	__be32 daddr, nexthop;
231 	struct flowi4 *fl4;
232 	struct rtable *rt;
233 	int err;
234 
235 	if (addr_len < sizeof(struct sockaddr_in))
236 		return -EINVAL;
237 
238 	if (usin->sin_family != AF_INET)
239 		return -EAFNOSUPPORT;
240 
241 	nexthop = daddr = usin->sin_addr.s_addr;
242 	inet_opt = rcu_dereference_protected(inet->inet_opt,
243 					     lockdep_sock_is_held(sk));
244 	if (inet_opt && inet_opt->opt.srr) {
245 		if (!daddr)
246 			return -EINVAL;
247 		nexthop = inet_opt->opt.faddr;
248 	}
249 
250 	orig_sport = inet->inet_sport;
251 	orig_dport = usin->sin_port;
252 	fl4 = &inet->cork.fl.u.ip4;
253 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
254 			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
255 			      orig_dport, sk);
256 	if (IS_ERR(rt)) {
257 		err = PTR_ERR(rt);
258 		if (err == -ENETUNREACH)
259 			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
260 		return err;
261 	}
262 
263 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
264 		ip_rt_put(rt);
265 		return -ENETUNREACH;
266 	}
267 
268 	if (!inet_opt || !inet_opt->opt.srr)
269 		daddr = fl4->daddr;
270 
271 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
272 
273 	if (!inet->inet_saddr) {
274 		err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
275 		if (err) {
276 			ip_rt_put(rt);
277 			return err;
278 		}
279 	} else {
280 		sk_rcv_saddr_set(sk, inet->inet_saddr);
281 	}
282 
283 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
284 		/* Reset inherited state */
285 		tp->rx_opt.ts_recent	   = 0;
286 		tp->rx_opt.ts_recent_stamp = 0;
287 		if (likely(!tp->repair))
288 			WRITE_ONCE(tp->write_seq, 0);
289 	}
290 
291 	inet->inet_dport = usin->sin_port;
292 	sk_daddr_set(sk, daddr);
293 
294 	inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
295 	if (inet_opt)
296 		inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
297 
298 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
299 
300 	/* Socket identity is still unknown (sport may be zero).
301 	 * However we set state to SYN-SENT and not releasing socket
302 	 * lock select source port, enter ourselves into the hash tables and
303 	 * complete initialization after this.
304 	 */
305 	tcp_set_state(sk, TCP_SYN_SENT);
306 	err = inet_hash_connect(tcp_death_row, sk);
307 	if (err)
308 		goto failure;
309 
310 	sk_set_txhash(sk);
311 
312 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
313 			       inet->inet_sport, inet->inet_dport, sk);
314 	if (IS_ERR(rt)) {
315 		err = PTR_ERR(rt);
316 		rt = NULL;
317 		goto failure;
318 	}
319 	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
320 	/* OK, now commit destination to socket.  */
321 	sk->sk_gso_type = SKB_GSO_TCPV4;
322 	sk_setup_caps(sk, &rt->dst);
323 	rt = NULL;
324 
325 	if (likely(!tp->repair)) {
326 		union tcp_seq_and_ts_off st;
327 
328 		st = secure_tcp_seq_and_ts_off(net,
329 					       inet->inet_saddr,
330 					       inet->inet_daddr,
331 					       inet->inet_sport,
332 					       usin->sin_port);
333 		if (!tp->write_seq)
334 			WRITE_ONCE(tp->write_seq, st.seq);
335 		WRITE_ONCE(tp->tsoffset, st.ts_off);
336 	}
337 
338 	atomic_set(&inet->inet_id, get_random_u16());
339 
340 	if (tcp_fastopen_defer_connect(sk, &err))
341 		return err;
342 	if (err)
343 		goto failure;
344 
345 	err = tcp_connect(sk);
346 
347 	if (err)
348 		goto failure;
349 
350 	return 0;
351 
352 failure:
353 	/*
354 	 * This unhashes the socket and releases the local port,
355 	 * if necessary.
356 	 */
357 	tcp_set_state(sk, TCP_CLOSE);
358 	inet_bhash2_reset_saddr(sk);
359 	ip_rt_put(rt);
360 	sk->sk_route_caps = 0;
361 	inet->inet_dport = 0;
362 	return err;
363 }
364 
365 /*
366  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
367  * It can be called through tcp_release_cb() if socket was owned by user
368  * at the time tcp_v4_err() was called to handle ICMP message.
369  */
tcp_v4_mtu_reduced(struct sock * sk)370 void tcp_v4_mtu_reduced(struct sock *sk)
371 {
372 	struct inet_sock *inet = inet_sk(sk);
373 	struct dst_entry *dst;
374 	u32 mtu, dmtu;
375 
376 	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
377 		return;
378 	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
379 	dst = inet_csk_update_pmtu(sk, mtu);
380 	if (!dst)
381 		return;
382 
383 	/* Something is about to be wrong... Remember soft error
384 	 * for the case, if this connection will not able to recover.
385 	 */
386 	dmtu = dst4_mtu(dst);
387 	if (mtu < dmtu && ip_dont_fragment(sk, dst))
388 		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
389 
390 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
391 	    ip_sk_accept_pmtu(sk) &&
392 	    inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
393 		tcp_sync_mss(sk, dmtu);
394 
395 		/* Resend the TCP packet because it's
396 		 * clear that the old packet has been
397 		 * dropped. This is the new "fast" path mtu
398 		 * discovery.
399 		 */
400 		tcp_simple_retransmit(sk);
401 	} /* else let the usual retransmit timer handle it */
402 }
403 
do_redirect(struct sk_buff * skb,struct sock * sk)404 static void do_redirect(struct sk_buff *skb, struct sock *sk)
405 {
406 	struct dst_entry *dst = __sk_dst_check(sk, 0);
407 
408 	if (dst)
409 		dst->ops->redirect(dst, sk, skb);
410 }
411 
412 
413 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
tcp_req_err(struct sock * sk,u32 seq,bool abort)414 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
415 {
416 	struct request_sock *req = inet_reqsk(sk);
417 	struct net *net = sock_net(sk);
418 
419 	/* ICMPs are not backlogged, hence we cannot get
420 	 * an established socket here.
421 	 */
422 	if (seq != tcp_rsk(req)->snt_isn) {
423 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
424 	} else if (abort) {
425 		/*
426 		 * Still in SYN_RECV, just remove it silently.
427 		 * There is no good way to pass the error to the newly
428 		 * created socket, and POSIX does not want network
429 		 * errors returned from accept().
430 		 */
431 		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
432 		tcp_listendrop(req->rsk_listener);
433 	}
434 	reqsk_put(req);
435 }
436 
437 /* TCP-LD (RFC 6069) logic */
tcp_ld_RTO_revert(struct sock * sk,u32 seq)438 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
439 {
440 	struct inet_connection_sock *icsk = inet_csk(sk);
441 	struct tcp_sock *tp = tcp_sk(sk);
442 	struct sk_buff *skb;
443 	s32 remaining;
444 	u32 delta_us;
445 
446 	if (sock_owned_by_user(sk))
447 		return;
448 
449 	if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
450 	    !icsk->icsk_backoff)
451 		return;
452 
453 	skb = tcp_rtx_queue_head(sk);
454 	if (WARN_ON_ONCE(!skb))
455 		return;
456 
457 	icsk->icsk_backoff--;
458 	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
459 	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));
460 
461 	tcp_mstamp_refresh(tp);
462 	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
463 	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
464 
465 	if (remaining > 0) {
466 		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
467 	} else {
468 		/* RTO revert clocked out retransmission.
469 		 * Will retransmit now.
470 		 */
471 		tcp_retransmit_timer(sk);
472 	}
473 }
474 
475 /*
476  * This routine is called by the ICMP module when it gets some
477  * sort of error condition.  If err < 0 then the socket should
478  * be closed and the error returned to the user.  If err > 0
479  * it's just the icmp type << 8 | icmp code.  After adjustment
480  * header points to the first 8 bytes of the tcp header.  We need
481  * to find the appropriate port.
482  *
483  * The locking strategy used here is very "optimistic". When
484  * someone else accesses the socket the ICMP is just dropped
485  * and for some paths there is no check at all.
486  * A more general error queue to queue errors for later handling
487  * is probably better.
488  *
489  */
490 
tcp_v4_err(struct sk_buff * skb,u32 info)491 int tcp_v4_err(struct sk_buff *skb, u32 info)
492 {
493 	const struct iphdr *iph = (const struct iphdr *)skb->data;
494 	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
495 	struct net *net = dev_net_rcu(skb->dev);
496 	const int type = icmp_hdr(skb)->type;
497 	const int code = icmp_hdr(skb)->code;
498 	struct request_sock *fastopen;
499 	struct tcp_sock *tp;
500 	u32 seq, snd_una;
501 	struct sock *sk;
502 	int err;
503 
504 	sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
505 				       ntohs(th->source), inet_iif(skb), 0);
506 	if (!sk) {
507 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
508 		return -ENOENT;
509 	}
510 	if (sk->sk_state == TCP_TIME_WAIT) {
511 		/* To increase the counter of ignored icmps for TCP-AO */
512 		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
513 		inet_twsk_put(inet_twsk(sk));
514 		return 0;
515 	}
516 	seq = ntohl(th->seq);
517 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
518 		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
519 				     type == ICMP_TIME_EXCEEDED ||
520 				     (type == ICMP_DEST_UNREACH &&
521 				      (code == ICMP_NET_UNREACH ||
522 				       code == ICMP_HOST_UNREACH)));
523 		return 0;
524 	}
525 
526 	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
527 		sock_put(sk);
528 		return 0;
529 	}
530 
531 	bh_lock_sock(sk);
532 	/* If too many ICMPs get dropped on busy
533 	 * servers this needs to be solved differently.
534 	 * We do take care of PMTU discovery (RFC1191) special case :
535 	 * we can receive locally generated ICMP messages while socket is held.
536 	 */
537 	if (sock_owned_by_user(sk)) {
538 		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
539 			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
540 	}
541 	if (sk->sk_state == TCP_CLOSE)
542 		goto out;
543 
544 	if (static_branch_unlikely(&ip4_min_ttl)) {
545 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
546 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
547 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
548 			goto out;
549 		}
550 	}
551 
552 	tp = tcp_sk(sk);
553 	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
554 	fastopen = rcu_dereference(tp->fastopen_rsk);
555 	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
556 	if (sk->sk_state != TCP_LISTEN &&
557 	    !between(seq, snd_una, tp->snd_nxt)) {
558 		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
559 		goto out;
560 	}
561 
562 	switch (type) {
563 	case ICMP_REDIRECT:
564 		if (!sock_owned_by_user(sk))
565 			do_redirect(skb, sk);
566 		goto out;
567 	case ICMP_SOURCE_QUENCH:
568 		/* Just silently ignore these. */
569 		goto out;
570 	case ICMP_PARAMETERPROB:
571 		err = EPROTO;
572 		break;
573 	case ICMP_DEST_UNREACH:
574 		if (code > NR_ICMP_UNREACH)
575 			goto out;
576 
577 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
578 			/* We are not interested in TCP_LISTEN and open_requests
579 			 * (SYN-ACKs send out by Linux are always <576bytes so
580 			 * they should go through unfragmented).
581 			 */
582 			if (sk->sk_state == TCP_LISTEN)
583 				goto out;
584 
585 			WRITE_ONCE(tp->mtu_info, info);
586 			if (!sock_owned_by_user(sk)) {
587 				tcp_v4_mtu_reduced(sk);
588 			} else {
589 				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
590 					sock_hold(sk);
591 			}
592 			goto out;
593 		}
594 
595 		err = icmp_err_convert[code].errno;
596 		/* check if this ICMP message allows revert of backoff.
597 		 * (see RFC 6069)
598 		 */
599 		if (!fastopen &&
600 		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
601 			tcp_ld_RTO_revert(sk, seq);
602 		break;
603 	case ICMP_TIME_EXCEEDED:
604 		err = EHOSTUNREACH;
605 		break;
606 	default:
607 		goto out;
608 	}
609 
610 	switch (sk->sk_state) {
611 	case TCP_SYN_SENT:
612 	case TCP_SYN_RECV:
613 		/* Only in fast or simultaneous open. If a fast open socket is
614 		 * already accepted it is treated as a connected one below.
615 		 */
616 		if (fastopen && !fastopen->sk)
617 			break;
618 
619 		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
620 
621 		if (!sock_owned_by_user(sk))
622 			tcp_done_with_error(sk, err);
623 		else
624 			WRITE_ONCE(sk->sk_err_soft, err);
625 		goto out;
626 	}
627 
628 	/* If we've already connected we will keep trying
629 	 * until we time out, or the user gives up.
630 	 *
631 	 * rfc1122 4.2.3.9 allows to consider as hard errors
632 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
633 	 * but it is obsoleted by pmtu discovery).
634 	 *
635 	 * Note, that in modern internet, where routing is unreliable
636 	 * and in each dark corner broken firewalls sit, sending random
637 	 * errors ordered by their masters even this two messages finally lose
638 	 * their original sense (even Linux sends invalid PORT_UNREACHs)
639 	 *
640 	 * Now we are in compliance with RFCs.
641 	 *							--ANK (980905)
642 	 */
643 
644 	if (!sock_owned_by_user(sk) &&
645 	    inet_test_bit(RECVERR, sk)) {
646 		WRITE_ONCE(sk->sk_err, err);
647 		sk_error_report(sk);
648 	} else	{ /* Only an error on timeout */
649 		WRITE_ONCE(sk->sk_err_soft, err);
650 	}
651 
652 out:
653 	bh_unlock_sock(sk);
654 	sock_put(sk);
655 	return 0;
656 }
657 
658 #define REPLY_OPTIONS_LEN      (MAX_TCP_OPTION_SPACE / sizeof(__be32))
659 
tcp_v4_ao_sign_reset(const struct sock * sk,struct sk_buff * skb,const struct tcp_ao_hdr * aoh,struct ip_reply_arg * arg,struct tcphdr * reply,__be32 reply_options[REPLY_OPTIONS_LEN])660 static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
661 				 const struct tcp_ao_hdr *aoh,
662 				 struct ip_reply_arg *arg, struct tcphdr *reply,
663 				 __be32 reply_options[REPLY_OPTIONS_LEN])
664 {
665 #ifdef CONFIG_TCP_AO
666 	int sdif = tcp_v4_sdif(skb);
667 	int dif = inet_iif(skb);
668 	int l3index = sdif ? dif : 0;
669 	bool allocated_traffic_key;
670 	struct tcp_ao_key *key;
671 	char *traffic_key;
672 	bool drop = true;
673 	u32 ao_sne = 0;
674 	u8 keyid;
675 
676 	rcu_read_lock();
677 	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
678 				 &key, &traffic_key, &allocated_traffic_key,
679 				 &keyid, &ao_sne))
680 		goto out;
681 
682 	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
683 				 (aoh->rnext_keyid << 8) | keyid);
684 	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
685 	reply->doff = arg->iov[0].iov_len / 4;
686 
687 	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
688 			    key, traffic_key,
689 			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
690 			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
691 			    reply, ao_sne))
692 		goto out;
693 	drop = false;
694 out:
695 	rcu_read_unlock();
696 	if (allocated_traffic_key)
697 		kfree(traffic_key);
698 	return drop;
699 #else
700 	return true;
701 #endif
702 }
703 
704 /*
705  *	This routine will send an RST to the other tcp.
706  *
707  *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
708  *		      for reset.
709  *	Answer: if a packet caused RST, it is not for a socket
710  *		existing in our system, if it is matched to a socket,
711  *		it is just duplicate segment or bug in other side's TCP.
712  *		So that we build reply only basing on parameters
713  *		arrived with segment.
714  *	Exception: precedence violation. We do not implement it in any case.
715  */
716 
tcp_v4_send_reset(const struct sock * sk,struct sk_buff * skb,enum sk_rst_reason reason)717 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
718 			      enum sk_rst_reason reason)
719 {
720 	const struct tcphdr *th = tcp_hdr(skb);
721 	struct {
722 		struct tcphdr th;
723 		__be32 opt[REPLY_OPTIONS_LEN];
724 	} rep;
725 	const __u8 *md5_hash_location = NULL;
726 	const struct tcp_ao_hdr *aoh;
727 	struct ip_reply_arg arg;
728 #ifdef CONFIG_TCP_MD5SIG
729 	struct tcp_md5sig_key *key = NULL;
730 	unsigned char newhash[16];
731 	struct sock *sk1 = NULL;
732 #endif
733 	u64 transmit_time = 0;
734 	struct sock *ctl_sk;
735 	struct net *net;
736 	u32 txhash = 0;
737 
738 	/* Never send a reset in response to a reset. */
739 	if (th->rst)
740 		return;
741 
742 	/* If sk not NULL, it means we did a successful lookup and incoming
743 	 * route had to be correct. prequeue might have dropped our dst.
744 	 */
745 	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
746 		return;
747 
748 	/* Swap the send and the receive. */
749 	memset(&rep, 0, sizeof(rep));
750 	rep.th.dest   = th->source;
751 	rep.th.source = th->dest;
752 	rep.th.doff   = sizeof(struct tcphdr) / 4;
753 	rep.th.rst    = 1;
754 
755 	if (th->ack) {
756 		rep.th.seq = th->ack_seq;
757 	} else {
758 		rep.th.ack = 1;
759 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
760 				       skb->len - (th->doff << 2));
761 	}
762 
763 	memset(&arg, 0, sizeof(arg));
764 	arg.iov[0].iov_base = (unsigned char *)&rep;
765 	arg.iov[0].iov_len  = sizeof(rep.th);
766 
767 	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
768 
769 	/* Invalid TCP option size or twice included auth */
770 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
771 		return;
772 
773 	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
774 		return;
775 
776 #ifdef CONFIG_TCP_MD5SIG
777 	rcu_read_lock();
778 	if (sk && sk_fullsock(sk)) {
779 		const union tcp_md5_addr *addr;
780 		int l3index;
781 
782 		/* sdif set, means packet ingressed via a device
783 		 * in an L3 domain and inet_iif is set to it.
784 		 */
785 		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
786 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
787 		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
788 	} else if (md5_hash_location) {
789 		const union tcp_md5_addr *addr;
790 		int sdif = tcp_v4_sdif(skb);
791 		int dif = inet_iif(skb);
792 		int l3index;
793 
794 		/*
795 		 * active side is lost. Try to find listening socket through
796 		 * source port, and then find md5 key through listening socket.
797 		 * we are not loose security here:
798 		 * Incoming packet is checked with md5 hash with finding key,
799 		 * no RST generated if md5 hash doesn't match.
800 		 */
801 		sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
802 					     th->source, ip_hdr(skb)->daddr,
803 					     ntohs(th->source), dif, sdif);
804 		/* don't send rst if it can't find key */
805 		if (!sk1)
806 			goto out;
807 
808 		/* sdif set, means packet ingressed via a device
809 		 * in an L3 domain and dif is set to it.
810 		 */
811 		l3index = sdif ? dif : 0;
812 		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
813 		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
814 		if (!key)
815 			goto out;
816 
817 		tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
818 		if (crypto_memneq(md5_hash_location, newhash, 16))
819 			goto out;
820 	}
821 
822 	if (key) {
823 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
824 				   (TCPOPT_NOP << 16) |
825 				   (TCPOPT_MD5SIG << 8) |
826 				   TCPOLEN_MD5SIG);
827 		/* Update length and the length the header thinks exists */
828 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
829 		rep.th.doff = arg.iov[0].iov_len / 4;
830 
831 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
832 				     key, ip_hdr(skb)->saddr,
833 				     ip_hdr(skb)->daddr, &rep.th);
834 	}
835 #endif
836 	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
837 	if (rep.opt[0] == 0) {
838 		__be32 mrst = mptcp_reset_option(skb);
839 
840 		if (mrst) {
841 			rep.opt[0] = mrst;
842 			arg.iov[0].iov_len += sizeof(mrst);
843 			rep.th.doff = arg.iov[0].iov_len / 4;
844 		}
845 	}
846 
847 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
848 				      ip_hdr(skb)->saddr, /* XXX */
849 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
850 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
851 	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
852 
853 	/* When socket is gone, all binding information is lost.
854 	 * routing might fail in this case. No choice here, if we choose to force
855 	 * input interface, we will misroute in case of asymmetric route.
856 	 */
857 	if (sk)
858 		arg.bound_dev_if = sk->sk_bound_dev_if;
859 
860 	trace_tcp_send_reset(sk, skb, reason);
861 
862 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
863 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
864 
865 	/* ECN bits of TW reset are cleared */
866 	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
867 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
868 	local_bh_disable();
869 	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
870 	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
871 
872 	sock_net_set(ctl_sk, net);
873 	if (sk) {
874 		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
875 				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
876 		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
877 				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
878 		transmit_time = tcp_transmit_time(sk);
879 		xfrm_sk_clone_policy(ctl_sk, sk);
880 		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
881 			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
882 	} else {
883 		ctl_sk->sk_mark = 0;
884 		ctl_sk->sk_priority = 0;
885 	}
886 	ip_send_unicast_reply(ctl_sk, sk,
887 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
888 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
889 			      &arg, arg.iov[0].iov_len,
890 			      transmit_time, txhash);
891 
892 	xfrm_sk_free_policy(ctl_sk);
893 	sock_net_set(ctl_sk, &init_net);
894 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
895 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
896 	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
897 	local_bh_enable();
898 
899 #ifdef CONFIG_TCP_MD5SIG
900 out:
901 	rcu_read_unlock();
902 #endif
903 }
904 
/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
   outside socket context is ugly, certainly. What can I do?
 */

/*
 * Build a bare ACK segment on the stack and send it as a reply to @skb
 * through the per-CPU control socket.
 *
 * @sk:          socket the reply is sent on behalf of; its sk_state is
 *               checked for TCP_TIME_WAIT below, in which case mark and
 *               priority come from the timewait socket
 * @skb:         received segment being answered (ports and addresses are
 *               taken from it, swapped)
 * @seq, @ack:   sequence / acknowledgment numbers, host byte order
 * @win:         window value placed in the header as-is (htons only)
 * @tsval, @tsecr: timestamp option values; the option is emitted only
 *               when @tsecr is non-zero
 * @oif:         when non-zero, output interface the reply is bound to
 * @key:         MD5 or TCP-AO key description used to sign the reply
 * @reply_flags: stored in ip_reply_arg.flags
 * @tos:         stored in ip_reply_arg.tos
 * @txhash:      passed through to ip_send_unicast_reply()
 */
static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	/* Reply buffer: TCP header immediately followed by option words. */
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE  >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	/* Only the header is cleared; option words are written on demand. */
	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		/* NOP, NOP, TIMESTAMP: three 32-bit words in total. */
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		/* Signature option goes after the timestamp words, if any. */
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		/* doff must be final before the header is hashed. */
		rep.th.doff = arg.iov[0].iov_len/4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		/* Same placement rule as for MD5: after the timestamp words. */
		int offset = (tsecr) ? 3 : 0;

		/* Option word: kind, length, send key id, rnext key id. */
		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	/* Pseudo-header checksum seed; addresses are those of the incoming
	 * segment, passed in reversed roles for the reply.
	 */
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	/* The per-CPU control socket is borrowed with BHs disabled and its
	 * local lock held for the whole transmit.
	 */
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	/* Temporarily move the control socket into the reply's netns. */
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	/* Hand the control socket back to init_net before unlocking. */
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}
1011 
/*
 * Answer a segment received for a TIME-WAIT socket with an ACK built from
 * the state kept in the timewait socket, then drop the timewait reference
 * with inet_twsk_put().
 *
 * @sk:        the timewait socket (accessed via inet_twsk()/tcp_twsk())
 * @skb:       the segment being acknowledged
 * @tw_status: when TCP_TW_ACK_OOW, the ECN bits of the reply TOS are
 *             cleared
 *
 * Note that every return path, including the early TCP-AO parse failure,
 * releases the timewait reference.
 */
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clear only the ECN bits of TIME-WAIT ACKs sent for out-of-window
	 * data or a PAWS reject; leave the ECN bits of other TIME-WAIT ACKs
	 * untouched so they are not placed in a different service queue
	 * (Classic rather than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			/* Malformed auth options: send nothing, just drop
			 * the timewait reference.
			 */
			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	/* key.ao_key can only be non-NULL if ao_info was loaded above. */
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	/* Without TCP-AO this dummy branch keeps the else-if chain below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}
1075 
/*
 * Send an ACK on behalf of a request socket, in reply to @skb.
 *
 * @sk:  the socket the request belongs to; its state selects the sequence
 *       number used (see the comment on @seq below)
 * @skb: the segment being acknowledged
 * @req: the request socket supplying rcv_nxt, window, timestamps, txhash
 *
 * When TCP-AO is in use for @req and no usable key can be found, or the
 * traffic key cannot be allocated, the function returns without sending
 * anything.
 */
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
					     tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		/* The request used AO but this segment carries no AO option. */
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		/* First try the key the peer asked for via rnext_keyid. */
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		/* Temporary traffic key; freed at the end of this function. */
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	/* Without TCP-AO this dummy branch keeps the else-if chain below
	 * syntactically valid.
	 */
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* The reply reuses the TOS of the incoming segment with its ECN
	 * bits cleared.
	 */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	/* Only the AO path allocated a traffic key above. */
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}
1153 
1154 /*
1155  *	Send a SYN-ACK after having received a SYN.
1156  *	This still operates on a request_sock only, not on a big
1157  *	socket.
1158  */
1159 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1160 			      struct flowi *fl,
1161 			      struct request_sock *req,
1162 			      struct tcp_fastopen_cookie *foc,
1163 			      enum tcp_synack_type synack_type,
1164 			      struct sk_buff *syn_skb)
1165 {
1166 	struct inet_request_sock *ireq = inet_rsk(req);
1167 	struct flowi4 fl4;
1168 	int err = -1;
1169 	struct sk_buff *skb;
1170 	u8 tos;
1171 
1172 	/* First, grab a route. */
1173 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1174 		return -1;
1175 
1176 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1177 
1178 	if (skb) {
1179 		tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
1180 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1181 
1182 		tos = READ_ONCE(inet_sk(sk)->tos);
1183 
1184 		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1185 			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1186 			      (tos & INET_ECN_MASK);
1187 
1188 		if (!INET_ECN_is_capable(tos) &&
1189 		    tcp_bpf_ca_needs_ecn((struct sock *)req))
1190 			tos |= INET_ECN_ECT_0;
1191 
1192 		rcu_read_lock();
1193 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1194 					    ireq->ir_rmt_addr,
1195 					    rcu_dereference(ireq->ireq_opt),
1196 					    tos);
1197 		rcu_read_unlock();
1198 		err = net_xmit_eval(err);
1199 	}
1200 
1201 	return err;
1202 }
1203 
1204 /*
1205  *	IPv4 request_sock destructor.
1206  */
1207 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1208 {
1209 	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1210 }
1211 
1212 #ifdef CONFIG_TCP_MD5SIG
1213 /*
1214  * RFC2385 MD5 checksumming requires a mapping of
1215  * IP address->MD5 Key.
1216  * We need to maintain these in the sk structure.
1217  */
1218 
/* Deferred static key; tcp_md5_do_add() and tcp_md5_key_copy() increment it
 * when they attach the first md5sig_info to a socket.
 */
DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1220 
1221 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1222 {
1223 	if (!old)
1224 		return true;
1225 
1226 	/* l3index always overrides non-l3index */
1227 	if (old->l3index && new->l3index == 0)
1228 		return false;
1229 	if (old->l3index == 0 && new->l3index)
1230 		return true;
1231 
1232 	return old->prefixlen < new->prefixlen;
1233 }
1234 
1235 /* Find the Key structure for an address.  */
1236 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1237 					   const union tcp_md5_addr *addr,
1238 					   int family, bool any_l3index)
1239 {
1240 	const struct tcp_sock *tp = tcp_sk(sk);
1241 	struct tcp_md5sig_key *key;
1242 	const struct tcp_md5sig_info *md5sig;
1243 	__be32 mask;
1244 	struct tcp_md5sig_key *best_match = NULL;
1245 	bool match;
1246 
1247 	/* caller either holds rcu_read_lock() or socket lock */
1248 	md5sig = rcu_dereference_check(tp->md5sig_info,
1249 				       lockdep_sock_is_held(sk));
1250 	if (!md5sig)
1251 		return NULL;
1252 
1253 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1254 				 lockdep_sock_is_held(sk)) {
1255 		if (key->family != family)
1256 			continue;
1257 		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
1258 		    key->l3index != l3index)
1259 			continue;
1260 		if (family == AF_INET) {
1261 			mask = inet_make_mask(key->prefixlen);
1262 			match = (key->addr.a4.s_addr & mask) ==
1263 				(addr->a4.s_addr & mask);
1264 #if IS_ENABLED(CONFIG_IPV6)
1265 		} else if (family == AF_INET6) {
1266 			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1267 						  key->prefixlen);
1268 #endif
1269 		} else {
1270 			match = false;
1271 		}
1272 
1273 		if (match && better_md5_match(best_match, key))
1274 			best_match = key;
1275 	}
1276 	return best_match;
1277 }
1278 
1279 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1280 						      const union tcp_md5_addr *addr,
1281 						      int family, u8 prefixlen,
1282 						      int l3index, u8 flags)
1283 {
1284 	const struct tcp_sock *tp = tcp_sk(sk);
1285 	struct tcp_md5sig_key *key;
1286 	unsigned int size = sizeof(struct in_addr);
1287 	const struct tcp_md5sig_info *md5sig;
1288 
1289 	/* caller either holds rcu_read_lock() or socket lock */
1290 	md5sig = rcu_dereference_check(tp->md5sig_info,
1291 				       lockdep_sock_is_held(sk));
1292 	if (!md5sig)
1293 		return NULL;
1294 #if IS_ENABLED(CONFIG_IPV6)
1295 	if (family == AF_INET6)
1296 		size = sizeof(struct in6_addr);
1297 #endif
1298 	hlist_for_each_entry_rcu(key, &md5sig->head, node,
1299 				 lockdep_sock_is_held(sk)) {
1300 		if (key->family != family)
1301 			continue;
1302 		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1303 			continue;
1304 		if (key->l3index != l3index)
1305 			continue;
1306 		if (!memcmp(&key->addr, addr, size) &&
1307 		    key->prefixlen == prefixlen)
1308 			return key;
1309 	}
1310 	return NULL;
1311 }
1312 
1313 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1314 					 const struct sock *addr_sk)
1315 {
1316 	const union tcp_md5_addr *addr;
1317 	int l3index;
1318 
1319 	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1320 						 addr_sk->sk_bound_dev_if);
1321 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1322 	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1323 }
1324 
1325 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1326 {
1327 	struct tcp_sock *tp = tcp_sk(sk);
1328 	struct tcp_md5sig_info *md5sig;
1329 
1330 	md5sig = kmalloc_obj(*md5sig, gfp);
1331 	if (!md5sig)
1332 		return -ENOMEM;
1333 
1334 	sk_gso_disable(sk);
1335 	INIT_HLIST_HEAD(&md5sig->head);
1336 	rcu_assign_pointer(tp->md5sig_info, md5sig);
1337 	return 0;
1338 }
1339 
1340 /* This can be called on a newly created socket, from other files */
1341 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1342 			    int family, u8 prefixlen, int l3index, u8 flags,
1343 			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1344 {
1345 	/* Add Key to the list */
1346 	struct tcp_md5sig_key *key;
1347 	struct tcp_sock *tp = tcp_sk(sk);
1348 	struct tcp_md5sig_info *md5sig;
1349 
1350 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1351 	if (key) {
1352 		/* Pre-existing entry - just update that one.
1353 		 * Note that the key might be used concurrently.
1354 		 * data_race() is telling kcsan that we do not care of
1355 		 * key mismatches, since changing MD5 key on live flows
1356 		 * can lead to packet drops.
1357 		 */
1358 		data_race(memcpy(key->key, newkey, newkeylen));
1359 
1360 		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
1361 		 * Also note that a reader could catch new key->keylen value
1362 		 * but old key->key[], this is the reason we use __GFP_ZERO
1363 		 * at sock_kmalloc() time below these lines.
1364 		 */
1365 		WRITE_ONCE(key->keylen, newkeylen);
1366 
1367 		return 0;
1368 	}
1369 
1370 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1371 					   lockdep_sock_is_held(sk));
1372 
1373 	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1374 	if (!key)
1375 		return -ENOMEM;
1376 
1377 	memcpy(key->key, newkey, newkeylen);
1378 	key->keylen = newkeylen;
1379 	key->family = family;
1380 	key->prefixlen = prefixlen;
1381 	key->l3index = l3index;
1382 	key->flags = flags;
1383 	memcpy(&key->addr, addr,
1384 	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1385 								 sizeof(struct in_addr));
1386 	hlist_add_head_rcu(&key->node, &md5sig->head);
1387 	return 0;
1388 }
1389 
1390 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1391 		   int family, u8 prefixlen, int l3index, u8 flags,
1392 		   const u8 *newkey, u8 newkeylen)
1393 {
1394 	struct tcp_sock *tp = tcp_sk(sk);
1395 
1396 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1397 		if (fips_enabled) {
1398 			pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
1399 			return -EOPNOTSUPP;
1400 		}
1401 
1402 		if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1403 			return -ENOMEM;
1404 
1405 		if (!static_branch_inc(&tcp_md5_needed.key)) {
1406 			struct tcp_md5sig_info *md5sig;
1407 
1408 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1409 			rcu_assign_pointer(tp->md5sig_info, NULL);
1410 			kfree_rcu(md5sig, rcu);
1411 			return -EUSERS;
1412 		}
1413 	}
1414 
1415 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1416 				newkey, newkeylen, GFP_KERNEL);
1417 }
1418 
1419 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1420 		     int family, u8 prefixlen, int l3index,
1421 		     struct tcp_md5sig_key *key)
1422 {
1423 	struct tcp_sock *tp = tcp_sk(sk);
1424 
1425 	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1426 
1427 		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1428 			return -ENOMEM;
1429 
1430 		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1431 			struct tcp_md5sig_info *md5sig;
1432 
1433 			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1434 			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1435 			rcu_assign_pointer(tp->md5sig_info, NULL);
1436 			kfree_rcu(md5sig, rcu);
1437 			return -EUSERS;
1438 		}
1439 	}
1440 
1441 	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1442 				key->flags, key->key, key->keylen,
1443 				sk_gfp_mask(sk, GFP_ATOMIC));
1444 }
1445 
1446 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1447 		   u8 prefixlen, int l3index, u8 flags)
1448 {
1449 	struct tcp_md5sig_key *key;
1450 
1451 	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1452 	if (!key)
1453 		return -ENOENT;
1454 	hlist_del_rcu(&key->node);
1455 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1456 	kfree_rcu(key, rcu);
1457 	return 0;
1458 }
1459 
1460 void tcp_clear_md5_list(struct sock *sk)
1461 {
1462 	struct tcp_sock *tp = tcp_sk(sk);
1463 	struct tcp_md5sig_key *key;
1464 	struct hlist_node *n;
1465 	struct tcp_md5sig_info *md5sig;
1466 
1467 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1468 
1469 	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1470 		hlist_del(&key->node);
1471 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1472 		kfree(key);
1473 	}
1474 }
1475 
1476 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1477 				 sockptr_t optval, int optlen)
1478 {
1479 	struct tcp_md5sig cmd;
1480 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1481 	const union tcp_md5_addr *addr;
1482 	u8 prefixlen = 32;
1483 	int l3index = 0;
1484 	bool l3flag;
1485 	u8 flags;
1486 
1487 	if (optlen < sizeof(cmd))
1488 		return -EINVAL;
1489 
1490 	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1491 		return -EFAULT;
1492 
1493 	if (sin->sin_family != AF_INET)
1494 		return -EINVAL;
1495 
1496 	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1497 	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1498 
1499 	if (optname == TCP_MD5SIG_EXT &&
1500 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1501 		prefixlen = cmd.tcpm_prefixlen;
1502 		if (prefixlen > 32)
1503 			return -EINVAL;
1504 	}
1505 
1506 	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1507 	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1508 		struct net_device *dev;
1509 
1510 		rcu_read_lock();
1511 		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1512 		if (dev && netif_is_l3_master(dev))
1513 			l3index = dev->ifindex;
1514 
1515 		rcu_read_unlock();
1516 
1517 		/* ok to reference set/not set outside of rcu;
1518 		 * right now device MUST be an L3 master
1519 		 */
1520 		if (!dev || !l3index)
1521 			return -EINVAL;
1522 	}
1523 
1524 	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1525 
1526 	if (!cmd.tcpm_keylen)
1527 		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1528 
1529 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1530 		return -EINVAL;
1531 
1532 	/* Don't allow keys for peers that have a matching TCP-AO key.
1533 	 * See the comment in tcp_ao_add_cmd()
1534 	 */
1535 	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
1536 		return -EKEYREJECTED;
1537 
1538 	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1539 			      cmd.tcpm_key, cmd.tcpm_keylen);
1540 }
1541 
1542 static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
1543 				    __be32 daddr, __be32 saddr,
1544 				    const struct tcphdr *th, int nbytes)
1545 {
1546 	struct {
1547 		struct tcp4_pseudohdr ip;
1548 		struct tcphdr tcp;
1549 	} h;
1550 
1551 	h.ip.saddr = saddr;
1552 	h.ip.daddr = daddr;
1553 	h.ip.pad = 0;
1554 	h.ip.protocol = IPPROTO_TCP;
1555 	h.ip.len = cpu_to_be16(nbytes);
1556 	h.tcp = *th;
1557 	h.tcp.check = 0;
1558 	md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
1559 }
1560 
1561 static noinline_for_stack void
1562 tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1563 		    __be32 daddr, __be32 saddr, const struct tcphdr *th)
1564 {
1565 	struct md5_ctx ctx;
1566 
1567 	md5_init(&ctx);
1568 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
1569 	tcp_md5_hash_key(&ctx, key);
1570 	md5_final(&ctx, md5_hash);
1571 }
1572 
1573 noinline_for_stack void
1574 tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1575 		    const struct sock *sk, const struct sk_buff *skb)
1576 {
1577 	const struct tcphdr *th = tcp_hdr(skb);
1578 	__be32 saddr, daddr;
1579 	struct md5_ctx ctx;
1580 
1581 	if (sk) { /* valid for establish/request sockets */
1582 		saddr = sk->sk_rcv_saddr;
1583 		daddr = sk->sk_daddr;
1584 	} else {
1585 		const struct iphdr *iph = ip_hdr(skb);
1586 		saddr = iph->saddr;
1587 		daddr = iph->daddr;
1588 	}
1589 
1590 	md5_init(&ctx);
1591 	tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
1592 	tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
1593 	tcp_md5_hash_key(&ctx, key);
1594 	md5_final(&ctx, md5_hash);
1595 }
1596 
1597 #endif
1598 
1599 static void tcp_v4_init_req(struct request_sock *req,
1600 			    const struct sock *sk_listener,
1601 			    struct sk_buff *skb)
1602 {
1603 	struct inet_request_sock *ireq = inet_rsk(req);
1604 	struct net *net = sock_net(sk_listener);
1605 
1606 	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1607 	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1608 	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1609 }
1610 
1611 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1612 					  struct sk_buff *skb,
1613 					  struct flowi *fl,
1614 					  struct request_sock *req,
1615 					  u32 tw_isn)
1616 {
1617 	tcp_v4_init_req(req, sk, skb);
1618 
1619 	if (security_inet_conn_request(sk, skb, req))
1620 		return NULL;
1621 
1622 	return inet_csk_route_req(sk, &fl->u.ip4, req);
1623 }
1624 
1625 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1626 	.family		=	PF_INET,
1627 	.obj_size	=	sizeof(struct tcp_request_sock),
1628 	.send_ack	=	tcp_v4_reqsk_send_ack,
1629 	.destructor	=	tcp_v4_reqsk_destructor,
1630 	.send_reset	=	tcp_v4_send_reset,
1631 };
1632 
1633 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1634 	.mss_clamp	=	TCP_MSS_DEFAULT,
1635 #ifdef CONFIG_TCP_MD5SIG
1636 	.req_md5_lookup	=	tcp_v4_md5_lookup,
1637 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1638 #endif
1639 #ifdef CONFIG_TCP_AO
1640 	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
1641 	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
1642 	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
1643 #endif
1644 #ifdef CONFIG_SYN_COOKIES
1645 	.cookie_init_seq =	cookie_v4_init_sequence,
1646 #endif
1647 	.route_req	=	tcp_v4_route_req,
1648 	.init_seq_and_ts_off	=	tcp_v4_init_seq_and_ts_off,
1649 	.send_synack	=	tcp_v4_send_synack,
1650 };
1651 
1652 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1653 {
1654 	/* Never answer to SYNs send to broadcast or multicast */
1655 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1656 		goto drop;
1657 
1658 	return tcp_conn_request(&tcp_request_sock_ops,
1659 				&tcp_request_sock_ipv4_ops, sk, skb);
1660 
1661 drop:
1662 	tcp_listendrop(sk);
1663 	return 0;
1664 }
1665 
1666 
/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 *
 * @sk:             listening socket
 * @skb:            segment that completed the handshake
 * @req:            request socket the child is created from
 * @dst:            route to use, or NULL to look one up here
 * @req_unhash:     request passed to inet_ehash_nolisten(); NULL in the
 *                  syncookie case (see the comment near the end)
 * @own_req:        set from the result of inet_ehash_nolisten()
 * @opt_child_init: optional extra child initialiser, only called when
 *                  IPv6 support is enabled
 *
 * Returns the child socket, or NULL on failure.  Every failure path
 * accounts a listen drop on @sk.
 */
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req,
				  void (*opt_child_init)(struct sock *newsk,
							 const struct sock *sk))
{
	struct inet_request_sock *ireq;
	bool found_dup_sk = false;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	const union tcp_md5_addr *addr;
	struct tcp_md5sig_key *key;
	int l3index;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;
	inet_sk_rx_dst_set(newsk, skb);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	/* From here on the request and the child both point at the same IP
	 * options; exactly one of the two pointers is cleared on every path
	 * below.
	 */
	inet_opt	      = rcu_dereference(ireq->ireq_opt);
	RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	newinet->rcv_tos      = ip_hdr(skb)->tos;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	atomic_set(&newinet->inet_id, get_random_u16());

	/* Set ToS of the new socket based upon the value of incoming SYN.
	 * ECT bits are set later in tcp_init_transfer().
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
		newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

#if IS_ENABLED(CONFIG_IPV6)
	if (opt_child_init)
		opt_child_init(newsk, sk);
#endif
	tcp_ca_openreq_child(newsk, dst);

	tcp_sync_mss(newsk, dst4_mtu(dst));
	newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
	l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
	/* Copy over the MD5 key from the original socket */
	addr = (union tcp_md5_addr *)&newinet->inet_daddr;
	key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	/* The key is not copied when the request used TCP-AO. */
	if (key && !tcp_rsk_used_ao(req)) {
		/* The child gets a host-specific (/32) copy of the key. */
		if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
			goto put_and_exit;
		sk_gso_disable(newsk);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
		goto put_and_exit; /* OOM, release back memory */
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
				       &found_dup_sk);
	if (likely(*own_req)) {
		tcp_move_syn(newtp, req);
		/* The child keeps the IP options; detach them from the
		 * request.
		 */
		ireq->ireq_opt = NULL;
	} else {
		/* The request keeps the IP options; detach them from the
		 * child.
		 */
		newinet->inet_opt = NULL;

		if (!req_unhash && found_dup_sk) {
			/* This code path should only be executed in the
			 * syncookie case only
			 */
			bh_unlock_sock(newsk);
			sock_put(newsk);
			newsk = NULL;
		}
	}
	return newsk;

	/* Error unwinding: the labels below fall through into each other. */
exit_overflow:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	tcp_listendrop(sk);
	return NULL;
put_and_exit:
	/* Detach the shared IP options from the child before closing it. */
	newinet->inet_opt = NULL;
	inet_csk_prepare_forced_close(newsk);
	tcp_done(newsk);
	goto exit;
}
1790 
/*
 * With syncookies enabled, pass any segment without the SYN flag to
 * cookie_v4_check() and return its result.  In every other case @sk is
 * returned unchanged.
 */
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
#ifdef CONFIG_SYN_COOKIES
	if (!tcp_hdr(skb)->syn)
		return cookie_v4_check(sk, skb);
#endif
	return sk;
}
1801 
1802 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1803 			 struct tcphdr *th, u32 *cookie)
1804 {
1805 	u16 mss = 0;
1806 #ifdef CONFIG_SYN_COOKIES
1807 	mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1808 				    &tcp_request_sock_ipv4_ops, sk, th);
1809 	if (mss) {
1810 		*cookie = __cookie_v4_init_sequence(iph, th, &mss);
1811 		tcp_synq_overflow(sk);
1812 	}
1813 #endif
1814 	return mss;
1815 }
1816 
/* Forward declaration so INDIRECT_CALL_1() below can name the function. */
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 *
 * Process one received segment for @sk.  Always returns 0; segments
 * that are rejected are dropped here, after a reset has been sent where
 * the state machine asked for one.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason reason;
	struct sock *rsk;

	reason = psp_sk_rx_policy_check(sk, skb);
	if (reason)
		goto err_discard;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst;

		dst = rcu_dereference_protected(sk->sk_rx_dst,
						lockdep_sock_is_held(sk));

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		/* Drop the cached rx dst if it differs from the one on this
		 * skb and either the input interface changed or the dst no
		 * longer passes its check.
		 */
		if (dst && unlikely(dst != skb_dst(skb))) {
			if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
			    !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
					     dst, 0)) {
				RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
				dst_release(dst);
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}

	if (tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_cookie_check(sk, skb);

		/* NULL: nothing further is done with the segment here. */
		if (!nsk)
			return 0;
		/* A different socket came back: let that child process the
		 * segment.
		 */
		if (nsk != sk) {
			reason = tcp_child_process(sk, nsk, skb);
			if (reason) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	reason = tcp_rcv_state_process(sk, skb);
	if (reason) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	/* The reset is sent on behalf of rsk, which may be the child. */
	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
	sk_skb_reason_drop(sk, skb, reason);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	reason = SKB_DROP_REASON_TCP_CSUM;
	trace_tcp_bad_csum(skb);
	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
	/* A checksum error is also counted as an input error. */
err_discard:
	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
1902 
/*
 * tcp_add_backlog - queue @skb on the backlog of @sk, coalescing if possible.
 *
 * After verifying the checksum, try to merge @skb into the current backlog
 * tail (sk->sk_backlog.tail) when sequence numbers, DS field, TCP flags,
 * header length and TCP options allow it. Otherwise queue it with
 * sk_add_backlog() under a limit derived from sk_rcvbuf and sk_sndbuf.
 *
 * Return: SKB_NOT_DROPPED_YET when @skb was merged or queued. For any other
 * return value, bh_unlock_sock(@sk) has already been called here and @skb
 * was neither merged nor queued.
 */
1903 enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1904 {
1905 	u32 tail_gso_size, tail_gso_segs;
1906 	struct skb_shared_info *shinfo;
1907 	const struct tcphdr *th;
1908 	struct tcphdr *thtail;
1909 	struct sk_buff *tail;
1910 	unsigned int hdrlen;
1911 	bool fragstolen;
1912 	u32 gso_segs;
1913 	u32 gso_size;
1914 	u64 limit;
1915 	int delta;
1916 	int err;
1917 
1918 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1919 	 * we can fix skb->truesize to its real value to avoid future drops.
1920 	 * This is valid because skb is not yet charged to the socket.
1921 	 * It has been noticed pure SACK packets were sometimes dropped
1922 	 * (if cooked by drivers without copybreak feature).
1923 	 */
1924 	skb_condense(skb);
1925 
1926 	tcp_cleanup_skb(skb);
1927 
	/* A bad checksum is an early drop: unlock and count it before returning. */
1928 	if (unlikely(tcp_checksum_complete(skb))) {
1929 		bh_unlock_sock(sk);
1930 		trace_tcp_bad_csum(skb);
1931 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1932 		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1933 		return SKB_DROP_REASON_TCP_CSUM;
1934 	}
1935 
1936 	/* Attempt coalescing to last skb in backlog, even if we are
1937 	 * above the limits.
1938 	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1939 	 */
1940 	th = (const struct tcphdr *)skb->data;
1941 	hdrlen = th->doff * 4;
1942 
1943 	tail = sk->sk_backlog.tail;
1944 	if (!tail)
1945 		goto no_coalesce;
1946 	thtail = (struct tcphdr *)tail->data;
1947 
	/* Coalesce only if skb directly follows tail in sequence space, both
	 * carry ACK and none of SYN/RST/URG, they agree on DS field, on
	 * ECE/CWR/AE and on header length, and their TCP option bytes match.
	 */
1948 	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1949 	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1950 	    ((TCP_SKB_CB(tail)->tcp_flags |
1951 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1952 	    !((TCP_SKB_CB(tail)->tcp_flags &
1953 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1954 	    ((TCP_SKB_CB(tail)->tcp_flags ^
1955 	      TCP_SKB_CB(skb)->tcp_flags) &
1956 	     (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
1957 	    !tcp_skb_can_collapse_rx(tail, skb) ||
1958 	    thtail->doff != th->doff ||
1959 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
1960 	    /* prior to PSP Rx policy check, retain exact PSP metadata */
1961 	    psp_skb_coalesce_diff(tail, skb))
1962 		goto no_coalesce;
1963 
	/* Drop the TCP header so only payload gets appended to tail; it is
	 * pushed back below if skb_try_coalesce() fails.
	 */
1964 	__skb_pull(skb, hdrlen);
1965 
1966 	shinfo = skb_shinfo(skb);
1967 	gso_size = shinfo->gso_size ?: skb->len;
1968 	gso_segs = shinfo->gso_segs ?: 1;
1969 
	/* From here on shinfo refers to tail; tail still carries its header. */
1970 	shinfo = skb_shinfo(tail);
1971 	tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1972 	tail_gso_segs = shinfo->gso_segs ?: 1;
1973 
1974 	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1975 		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1976 
		/* Take ack_seq and window from skb unless its ACK is older. */
1977 		if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1978 			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1979 			thtail->window = th->window;
1980 		}
1981 
1982 		/* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1983 		 * thtail->fin, so that the fast path in tcp_rcv_established()
1984 		 * is not entered if we append a packet with a FIN.
1985 		 * SYN, RST, URG are not present.
1986 		 * ACK is set on both packets.
1987 		 * PSH : we do not really care in TCP stack,
1988 		 *       at least for 'GRO' packets.
1989 		 */
1990 		thtail->fin |= th->fin;
1991 		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1992 
1993 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
1994 			TCP_SKB_CB(tail)->has_rxtstamp = true;
1995 			tail->tstamp = skb->tstamp;
1996 			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1997 		}
1998 
1999 		/* Not as strict as GRO. We only need to carry mss max value */
2000 		shinfo->gso_size = max(gso_size, tail_gso_size);
2001 		shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
2002 
2003 		sk->sk_backlog.len += delta;
2004 		__NET_INC_STATS(sock_net(sk),
2005 				LINUX_MIB_TCPBACKLOGCOALESCE);
2006 		kfree_skb_partial(skb, fragstolen);
2007 		return SKB_NOT_DROPPED_YET;
2008 	}
	/* Coalescing failed: restore the header before queueing skb as is. */
2009 	__skb_push(skb, hdrlen);
2010 
2011 no_coalesce:
2012 	/* sk->sk_backlog.len is reset only at the end of __release_sock().
2013 	 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2014 	 * sk_rcvbuf in normal conditions.
2015 	 */
2016 	limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2017 
2018 	limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2019 
2020 	/* Only socket owner can try to collapse/prune rx queues
2021 	 * to reduce memory overhead, so add a little headroom here.
2022 	 * Few sockets backlog are possibly concurrently non empty.
2023 	 */
2024 	limit += 64 * 1024;
2025 
	/* The sum was computed in 64 bits; clamp it to UINT_MAX before use. */
2026 	limit = min_t(u64, limit, UINT_MAX);
2027 
2028 	err = sk_add_backlog(sk, skb, limit);
2029 	if (unlikely(err)) {
2030 		bh_unlock_sock(sk);
		/* -ENOMEM is accounted as a pfmemalloc drop, anything else as a backlog drop. */
2031 		if (err == -ENOMEM) {
2032 			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
2033 			return SKB_DROP_REASON_PFMEMALLOC;
2034 		}
2035 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2036 		return SKB_DROP_REASON_SOCKET_BACKLOG;
2037 	}
2038 	return SKB_NOT_DROPPED_YET;
2039 }
2040 
2041 static void tcp_v4_restore_cb(struct sk_buff *skb)
2042 {
2043 	memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2044 		sizeof(struct inet_skb_parm));
2045 }
2046 
2047 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2048 			   const struct tcphdr *th)
2049 {
2050 	/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
2051 	 * barrier() makes sure compiler wont play fool^Waliasing games.
2052 	 */
2053 	memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2054 		sizeof(struct inet_skb_parm));
2055 	barrier();
2056 
2057 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2058 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2059 				    skb->len - th->doff * 4);
2060 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2061 	TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2062 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2063 	TCP_SKB_CB(skb)->sacked	 = 0;
2064 	TCP_SKB_CB(skb)->has_rxtstamp =
2065 			skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2066 }
2067 
2068 /*
2069  *	From tcp_input.c
2070  */
2071 
/*
 * tcp_v4_rcv - receive path entry for one IPv4 TCP segment.
 *
 * Validates the TCP header length and checksum, looks up the socket for the
 * segment and dispatches on its state:
 *  - TCP_TIME_WAIT goes to the do_time_wait path,
 *  - TCP_NEW_SYN_RECV (a request sock) is checked against its listener via
 *    tcp_check_req() and possibly handed to the child socket,
 *  - everything else goes through "process", which either calls
 *    tcp_v4_do_rcv() directly or, when the socket is owned by user context,
 *    queues the segment with tcp_add_backlog().
 *
 * Return: 0 on every drop path and when the segment was backlogged or passed
 * to a child socket; otherwise the return value of tcp_v4_do_rcv().
 */
2072 int tcp_v4_rcv(struct sk_buff *skb)
2073 {
2074 	struct net *net = dev_net_rcu(skb->dev);
2075 	enum skb_drop_reason drop_reason;
2076 	enum tcp_tw_status tw_status;
2077 	int sdif = inet_sdif(skb);
2078 	int dif = inet_iif(skb);
2079 	const struct iphdr *iph;
2080 	const struct tcphdr *th;
2081 	struct sock *sk = NULL;
2082 	bool refcounted;
2083 	int ret;
2084 	u32 isn;
2085 
2086 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2087 	if (skb->pkt_type != PACKET_HOST)
2088 		goto discard_it;
2089 
2090 	/* Count it even if it's bad */
2091 	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
2092 
2093 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2094 		goto discard_it;
2095 
2096 	th = (const struct tcphdr *)skb->data;
2097 
	/* doff counts 32-bit words; reject headers shorter than struct tcphdr. */
2098 	if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2099 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2100 		goto bad_packet;
2101 	}
2102 	if (!pskb_may_pull(skb, th->doff * 4))
2103 		goto discard_it;
2104 
2105 	/* An explanation is required here, I think.
2106 	 * Packet length and doff are validated by header prediction,
2107 	 * provided case of th->doff==0 is eliminated.
2108 	 * So, we defer the checks. */
2109 
2110 	if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2111 		goto csum_error;
2112 
	/* Header pointers are reloaded here, after the pskb_may_pull() calls above. */
2113 	th = (const struct tcphdr *)skb->data;
2114 	iph = ip_hdr(skb);
2115 lookup:
2116 	sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
2117 			       th->dest, sdif, &refcounted);
2118 	if (!sk)
2119 		goto no_tcp_socket;
2120 
2121 	if (sk->sk_state == TCP_TIME_WAIT)
2122 		goto do_time_wait;
2123 
2124 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
2125 		struct request_sock *req = inet_reqsk(sk);
2126 		bool req_stolen = false;
2127 		struct sock *nsk;
2128 
		/* The lookup returned a request sock; work on its listener from here. */
2129 		sk = req->rsk_listener;
2130 		if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2131 			drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2132 		else
2133 			drop_reason = tcp_inbound_hash(sk, req, skb,
2134 						       &iph->saddr, &iph->daddr,
2135 						       AF_INET, dif, sdif);
2136 		if (unlikely(drop_reason)) {
2137 			sk_drops_skbadd(sk, skb);
2138 			reqsk_put(req);
2139 			goto discard_it;
2140 		}
2141 		if (tcp_checksum_complete(skb)) {
2142 			reqsk_put(req);
2143 			goto csum_error;
2144 		}
2145 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
2146 			nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2147 			if (!nsk) {
2148 				inet_csk_reqsk_queue_drop_and_put(sk, req);
2149 				goto lookup;
2150 			}
2151 			sk = nsk;
2152 			/* reuseport_migrate_sock() has already held one sk_refcnt
2153 			 * before returning.
2154 			 */
2155 		} else {
2156 			/* We own a reference on the listener, increase it again
2157 			 * as we might lose it too soon.
2158 			 */
2159 			sock_hold(sk);
2160 		}
2161 		refcounted = true;
2162 		nsk = NULL;
2163 		drop_reason = tcp_filter(sk, skb);
2164 		if (!drop_reason) {
2165 			th = (const struct tcphdr *)skb->data;
2166 			iph = ip_hdr(skb);
2167 			tcp_v4_fill_cb(skb, iph, th);
2168 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
2169 					    &drop_reason);
2170 		}
2171 		if (!nsk) {
2172 			reqsk_put(req);
2173 			if (req_stolen) {
2174 				/* Another cpu got exclusive access to req
2175 				 * and created a full blown socket.
2176 				 * Try to feed this packet to this socket
2177 				 * instead of discarding it.
2178 				 */
2179 				tcp_v4_restore_cb(skb);
2180 				sock_put(sk);
2181 				goto lookup;
2182 			}
2183 			goto discard_and_relse;
2184 		}
2185 		nf_reset_ct(skb);
		/* nsk == sk: tcp_check_req() handed back the listener itself, so
		 * restore the cb and continue on the regular "process" path.
		 */
2186 		if (nsk == sk) {
2187 			reqsk_put(req);
2188 			tcp_v4_restore_cb(skb);
2189 		} else {
2190 			drop_reason = tcp_child_process(sk, nsk, skb);
2191 			if (drop_reason) {
2192 				enum sk_rst_reason rst_reason;
2193 
2194 				rst_reason = sk_rst_convert_drop_reason(drop_reason);
2195 				tcp_v4_send_reset(nsk, skb, rst_reason);
2196 				goto discard_and_relse;
2197 			}
2198 			sock_put(sk);
2199 			return 0;
2200 		}
2201 	}
2202 
2203 process:
2204 	if (static_branch_unlikely(&ip4_min_ttl)) {
2205 		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
2206 		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2207 			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2208 			drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2209 			goto discard_and_relse;
2210 		}
2211 	}
2212 
2213 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2214 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2215 		goto discard_and_relse;
2216 	}
2217 
2218 	drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
2219 				       AF_INET, dif, sdif);
2220 	if (drop_reason)
2221 		goto discard_and_relse;
2222 
2223 	nf_reset_ct(skb);
2224 
2225 	drop_reason = tcp_filter(sk, skb);
2226 	if (drop_reason)
2227 		goto discard_and_relse;
2228 
	/* Header pointers are reloaded after tcp_filter() before filling the cb. */
2229 	th = (const struct tcphdr *)skb->data;
2230 	iph = ip_hdr(skb);
2231 	tcp_v4_fill_cb(skb, iph, th);
2232 
2233 	skb->dev = NULL;
2234 
	/* Listeners are handled without taking the socket lock here. */
2235 	if (sk->sk_state == TCP_LISTEN) {
2236 		ret = tcp_v4_do_rcv(sk, skb);
2237 		goto put_and_return;
2238 	}
2239 
2240 	sk_incoming_cpu_update(sk);
2241 
2242 	bh_lock_sock_nested(sk);
2243 	tcp_segs_in(tcp_sk(sk), skb);
2244 	ret = 0;
2245 	if (!sock_owned_by_user(sk)) {
2246 		ret = tcp_v4_do_rcv(sk, skb);
2247 	} else {
		/* No bh_unlock_sock() on the drop path below: tcp_add_backlog()
		 * unlocks the socket itself before returning a drop reason.
		 */
2248 		drop_reason = tcp_add_backlog(sk, skb);
2249 		if (drop_reason)
2250 			goto discard_and_relse;
2251 	}
2252 	bh_unlock_sock(sk);
2253 
2254 put_and_return:
2255 	if (refcounted)
2256 		sock_put(sk);
2257 
2258 	return ret;
2259 
2260 no_tcp_socket:
2261 	drop_reason = SKB_DROP_REASON_NO_SOCKET;
2262 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2263 		goto discard_it;
2264 
2265 	tcp_v4_fill_cb(skb, iph, th);
2266 
	/* csum_error and bad_packet are jump targets inside this if-body; a
	 * segment with a valid checksum and no socket gets a reset instead.
	 */
2267 	if (tcp_checksum_complete(skb)) {
2268 csum_error:
2269 		drop_reason = SKB_DROP_REASON_TCP_CSUM;
2270 		trace_tcp_bad_csum(skb);
2271 		__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2272 bad_packet:
2273 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
2274 	} else {
2275 		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
2276 	}
2277 
2278 discard_it:
2279 	SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2280 	/* Discard frame. */
2281 	sk_skb_reason_drop(sk, skb, drop_reason);
2282 	return 0;
2283 
2284 discard_and_relse:
2285 	sk_drops_skbadd(sk, skb);
2286 	if (refcounted)
2287 		sock_put(sk);
2288 	goto discard_it;
2289 
2290 do_time_wait:
2291 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2292 		drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2293 		inet_twsk_put(inet_twsk(sk));
2294 		goto discard_it;
2295 	}
2296 
2297 	tcp_v4_fill_cb(skb, iph, th);
2298 
2299 	if (tcp_checksum_complete(skb)) {
2300 		inet_twsk_put(inet_twsk(sk));
2301 		goto csum_error;
2302 	}
2303 
2304 	tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
2305 					       &drop_reason);
2306 	switch (tw_status) {
2307 	case TCP_TW_SYN: {
2308 		struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
2309 							iph->saddr, th->source,
2310 							iph->daddr, th->dest,
2311 							inet_iif(skb),
2312 							sdif);
		/* A listener exists: drop the timewait socket and process the
		 * SYN on the listener, passing isn through tcp_tw_isn.
		 */
2313 		if (sk2) {
2314 			inet_twsk_deschedule_put(inet_twsk(sk));
2315 			sk = sk2;
2316 			tcp_v4_restore_cb(skb);
2317 			refcounted = false;
2318 			__this_cpu_write(tcp_tw_isn, isn);
2319 			goto process;
2320 		}
2321 
2322 		drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
2323 		if (drop_reason)
2324 			break;
2325 	}
2326 		/* to ACK */
2327 		fallthrough;
2328 	case TCP_TW_ACK:
2329 	case TCP_TW_ACK_OOW:
2330 		tcp_v4_timewait_ack(sk, skb, tw_status);
2331 		break;
2332 	case TCP_TW_RST:
2333 		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2334 		inet_twsk_deschedule_put(inet_twsk(sk));
2335 		goto discard_it;
2336 	case TCP_TW_SUCCESS:;
2337 	}
2338 	goto discard_it;
2339 }
2340 
/* Timewait socket ops for TCP; only the timewait object size is set here. */
2341 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2342 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
2343 };
2344 
2345 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2346 {
2347 	struct dst_entry *dst = skb_dst(skb);
2348 
2349 	if (dst && dst_hold_safe(dst)) {
2350 		rcu_assign_pointer(sk->sk_rx_dst, dst);
2351 		sk->sk_rx_dst_ifindex = skb->skb_iif;
2352 	}
2353 }
2354 
/* IPv4 address-family operations for connection sockets: transmit, header
 * rebuild, rx dst caching, connection request / child creation, socket
 * options and MTU-reduction handlers, plus the IPv4 network header length.
 */
2355 const struct inet_connection_sock_af_ops ipv4_specific = {
2356 	.queue_xmit	   = ip_queue_xmit,
2357 	.rebuild_header	   = inet_sk_rebuild_header,
2358 	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
2359 	.conn_request	   = tcp_v4_conn_request,
2360 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
2361 	.net_header_len	   = sizeof(struct iphdr),
2362 	.setsockopt	   = ip_setsockopt,
2363 	.getsockopt	   = ip_getsockopt,
2364 	.mtu_reduced	   = tcp_v4_mtu_reduced,
2365 };
2366 
2367 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/* IPv4 hooks for TCP segment authentication. The MD5 and TCP-AO members are
 * each present only when the corresponding config option is enabled.
 */
2368 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2369 #ifdef CONFIG_TCP_MD5SIG
2370 	.md5_lookup		= tcp_v4_md5_lookup,
2371 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
2372 	.md5_parse		= tcp_v4_parse_md5_keys,
2373 #endif
2374 #ifdef CONFIG_TCP_AO
2375 	.ao_lookup		= tcp_v4_ao_lookup,
2376 	.calc_ao_hash		= tcp_v4_ao_hash_skb,
2377 	.ao_parse		= tcp_v4_parse_ao,
2378 	.ao_calc_key_sk		= tcp_v4_ao_calc_key_sk,
2379 #endif
2380 };
2381 
/* Socket destructor used when MD5 or TCP-AO support is built in: runs the
 * MD5 and TCP-AO teardown helpers, then the generic inet_sock_destruct().
 */
2382 static void tcp4_destruct_sock(struct sock *sk)
2383 {
2384 	tcp_md5_destruct_sock(sk);
2385 	tcp_ao_destroy_sock(sk, false);
2386 	inet_sock_destruct(sk);
2387 }
2388 #endif
2389 
2390 /* NOTE: A lot of things set to zero explicitly by call to
2391  *       sk_alloc() so need not be done here.
2392  */
2393 static int tcp_v4_init_sock(struct sock *sk)
2394 {
2395 	struct inet_connection_sock *icsk = inet_csk(sk);
2396 
2397 	tcp_init_sock(sk);
2398 
2399 	icsk->icsk_af_ops = &ipv4_specific;
2400 
2401 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2402 	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2403 	sk->sk_destruct = tcp4_destruct_sock;
2404 #endif
2405 
2406 	return 0;
2407 }
2408 
/* With CONFIG_PAGE_POOL, walk every entry of sk->sk_user_frags and hand it
 * to napi_pp_put_page(), warning once if that call reports failure.
 * Without CONFIG_PAGE_POOL this is a no-op. The xarray itself is not
 * destroyed here.
 */
2409 static void tcp_release_user_frags(struct sock *sk)
2410 {
2411 #ifdef CONFIG_PAGE_POOL
2412 	unsigned long index;
2413 	void *netmem;
2414 
2415 	xa_for_each(&sk->sk_user_frags, index, netmem)
2416 		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2417 #endif
2418 }
2419 
/* Tear down the TCP state of @sk: user frags, timers, congestion control,
 * ULP, write and out-of-order queues, bind bucket, fastopen state and the
 * saved SYN, then decrement the allocated-sockets counter.
 */
2420 void tcp_v4_destroy_sock(struct sock *sk)
2421 {
2422 	struct tcp_sock *tp = tcp_sk(sk);
2423 
2424 	tcp_release_user_frags(sk);
2425 
2426 	xa_destroy(&sk->sk_user_frags);
2427 
2428 	trace_tcp_destroy_sock(sk);
2429 
2430 	tcp_clear_xmit_timers(sk);
2431 
2432 	tcp_cleanup_congestion_control(sk);
2433 
2434 	tcp_cleanup_ulp(sk);
2435 
2436 	/* Clean up the write buffer. */
2437 	tcp_write_queue_purge(sk);
2438 
2439 	/* Check if we want to disable active TFO */
2440 	tcp_fastopen_active_disable_ofo_check(sk);
2441 
2442 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2443 	skb_rbtree_purge(&tp->out_of_order_queue);
2444 
2445 	/* Clean up a referenced TCP bind bucket. */
2446 	if (inet_csk(sk)->icsk_bind_hash)
2447 		inet_put_port(sk);
2448 
	/* A fastopen request sock must no longer be attached at this point. */
2449 	BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2450 
2451 	/* If socket is aborted during connect operation */
2452 	tcp_free_fastopen_req(tp);
2453 	tcp_fastopen_destroy_cipher(sk);
2454 	tcp_saved_syn_free(tp);
2455 
2456 	sk_sockets_allocated_dec(sk);
2457 }
2458 
2459 #ifdef CONFIG_PROC_FS
2460 /* Proc filesystem TCP sock list dumping. */
2461 
2462 static unsigned short seq_file_family(const struct seq_file *seq);
2463 
2464 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2465 {
2466 	unsigned short family = seq_file_family(seq);
2467 
2468 	/* AF_UNSPEC is used as a match all */
2469 	return ((family == AF_UNSPEC || family == sk->sk_family) &&
2470 		net_eq(sock_net(sk), seq_file_net(seq)));
2471 }
2472 
2473 /* Find a non empty bucket (starting from st->bucket)
2474  * and return the first sk from it.
2475  */
2476 static void *listening_get_first(struct seq_file *seq)
2477 {
2478 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2479 	struct tcp_iter_state *st = seq->private;
2480 
2481 	st->offset = 0;
2482 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2483 		struct inet_listen_hashbucket *ilb2;
2484 		struct hlist_nulls_node *node;
2485 		struct sock *sk;
2486 
2487 		ilb2 = &hinfo->lhash2[st->bucket];
2488 		if (hlist_nulls_empty(&ilb2->nulls_head))
2489 			continue;
2490 
2491 		spin_lock(&ilb2->lock);
2492 		sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2493 			if (seq_sk_match(seq, sk))
2494 				return sk;
2495 		}
2496 		spin_unlock(&ilb2->lock);
2497 	}
2498 
2499 	return NULL;
2500 }
2501 
2502 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2503  * If "cur" is the last one in the st->bucket,
2504  * call listening_get_first() to return the first sk of the next
2505  * non empty bucket.
2506  */
2507 static void *listening_get_next(struct seq_file *seq, void *cur)
2508 {
2509 	struct tcp_iter_state *st = seq->private;
2510 	struct inet_listen_hashbucket *ilb2;
2511 	struct hlist_nulls_node *node;
2512 	struct inet_hashinfo *hinfo;
2513 	struct sock *sk = cur;
2514 
2515 	++st->num;
2516 	++st->offset;
2517 
2518 	sk = sk_nulls_next(sk);
2519 	sk_nulls_for_each_from(sk, node) {
2520 		if (seq_sk_match(seq, sk))
2521 			return sk;
2522 	}
2523 
2524 	hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2525 	ilb2 = &hinfo->lhash2[st->bucket];
2526 	spin_unlock(&ilb2->lock);
2527 	++st->bucket;
2528 	return listening_get_first(seq);
2529 }
2530 
2531 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2532 {
2533 	struct tcp_iter_state *st = seq->private;
2534 	void *rc;
2535 
2536 	st->bucket = 0;
2537 	st->offset = 0;
2538 	rc = listening_get_first(seq);
2539 
2540 	while (rc && *pos) {
2541 		rc = listening_get_next(seq, rc);
2542 		--*pos;
2543 	}
2544 	return rc;
2545 }
2546 
2547 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2548 				const struct tcp_iter_state *st)
2549 {
2550 	return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2551 }
2552 
2553 /*
2554  * Get first established socket starting from bucket given in st->bucket.
2555  * If st->bucket is zero, the very first socket in the hash is returned.
2556  */
2557 static void *established_get_first(struct seq_file *seq)
2558 {
2559 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2560 	struct tcp_iter_state *st = seq->private;
2561 
2562 	st->offset = 0;
2563 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2564 		struct sock *sk;
2565 		struct hlist_nulls_node *node;
2566 		spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2567 
2568 		cond_resched();
2569 
2570 		/* Lockless fast path for the common case of empty buckets */
2571 		if (empty_bucket(hinfo, st))
2572 			continue;
2573 
2574 		spin_lock_bh(lock);
2575 		sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2576 			if (seq_sk_match(seq, sk))
2577 				return sk;
2578 		}
2579 		spin_unlock_bh(lock);
2580 	}
2581 
2582 	return NULL;
2583 }
2584 
2585 static void *established_get_next(struct seq_file *seq, void *cur)
2586 {
2587 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2588 	struct tcp_iter_state *st = seq->private;
2589 	struct hlist_nulls_node *node;
2590 	struct sock *sk = cur;
2591 
2592 	++st->num;
2593 	++st->offset;
2594 
2595 	sk = sk_nulls_next(sk);
2596 
2597 	sk_nulls_for_each_from(sk, node) {
2598 		if (seq_sk_match(seq, sk))
2599 			return sk;
2600 	}
2601 
2602 	spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2603 	++st->bucket;
2604 	return established_get_first(seq);
2605 }
2606 
2607 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2608 {
2609 	struct tcp_iter_state *st = seq->private;
2610 	void *rc;
2611 
2612 	st->bucket = 0;
2613 	rc = established_get_first(seq);
2614 
2615 	while (rc && pos) {
2616 		rc = established_get_next(seq, rc);
2617 		--pos;
2618 	}
2619 	return rc;
2620 }
2621 
2622 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2623 {
2624 	void *rc;
2625 	struct tcp_iter_state *st = seq->private;
2626 
2627 	st->state = TCP_SEQ_STATE_LISTENING;
2628 	rc	  = listening_get_idx(seq, &pos);
2629 
2630 	if (!rc) {
2631 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2632 		rc	  = established_get_idx(seq, pos);
2633 	}
2634 
2635 	return rc;
2636 }
2637 
/* Try to get back to the position recorded in the iterator state
 * (st->state, st->bucket, st->offset) by restarting at the saved bucket and
 * stepping forward at most the saved offset, stopping early if the walk
 * leaves that bucket. st->num is restored to its value on entry.
 *
 * Returns the socket found, or NULL if none was found.
 */
2638 static void *tcp_seek_last_pos(struct seq_file *seq)
2639 {
2640 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2641 	struct tcp_iter_state *st = seq->private;
2642 	int bucket = st->bucket;
2643 	int offset = st->offset;
2644 	int orig_num = st->num;
2645 	void *rc = NULL;
2646 
2647 	switch (st->state) {
2648 	case TCP_SEQ_STATE_LISTENING:
2649 		if (st->bucket > hinfo->lhash2_mask)
2650 			break;
2651 		rc = listening_get_first(seq);
2652 		while (offset-- && rc && bucket == st->bucket)
2653 			rc = listening_get_next(seq, rc);
2654 		if (rc)
2655 			break;
		/* Listening table exhausted: continue with the established table. */
2656 		st->bucket = 0;
2657 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2658 		fallthrough;
2659 	case TCP_SEQ_STATE_ESTABLISHED:
2660 		if (st->bucket > hinfo->ehash_mask)
2661 			break;
2662 		rc = established_get_first(seq);
2663 		while (offset-- && rc && bucket == st->bucket)
2664 			rc = established_get_next(seq, rc);
2665 	}
2666 
	/* The *_get_next() helpers bump st->num; undo that side effect. */
2667 	st->num = orig_num;
2668 
2669 	return rc;
2670 }
2671 
2672 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2673 {
2674 	struct tcp_iter_state *st = seq->private;
2675 	void *rc;
2676 
2677 	if (*pos && *pos == st->last_pos) {
2678 		rc = tcp_seek_last_pos(seq);
2679 		if (rc)
2680 			goto out;
2681 	}
2682 
2683 	st->state = TCP_SEQ_STATE_LISTENING;
2684 	st->num = 0;
2685 	st->bucket = 0;
2686 	st->offset = 0;
2687 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2688 
2689 out:
2690 	st->last_pos = *pos;
2691 	return rc;
2692 }
2693 
2694 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2695 {
2696 	struct tcp_iter_state *st = seq->private;
2697 	void *rc = NULL;
2698 
2699 	if (v == SEQ_START_TOKEN) {
2700 		rc = tcp_get_idx(seq, 0);
2701 		goto out;
2702 	}
2703 
2704 	switch (st->state) {
2705 	case TCP_SEQ_STATE_LISTENING:
2706 		rc = listening_get_next(seq, v);
2707 		if (!rc) {
2708 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2709 			st->bucket = 0;
2710 			st->offset = 0;
2711 			rc	  = established_get_first(seq);
2712 		}
2713 		break;
2714 	case TCP_SEQ_STATE_ESTABLISHED:
2715 		rc = established_get_next(seq, v);
2716 		break;
2717 	}
2718 out:
2719 	++*pos;
2720 	st->last_pos = *pos;
2721 	return rc;
2722 }
2723 
2724 void tcp_seq_stop(struct seq_file *seq, void *v)
2725 {
2726 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2727 	struct tcp_iter_state *st = seq->private;
2728 
2729 	switch (st->state) {
2730 	case TCP_SEQ_STATE_LISTENING:
2731 		if (v != SEQ_START_TOKEN)
2732 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2733 		break;
2734 	case TCP_SEQ_STATE_ESTABLISHED:
2735 		if (v)
2736 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2737 		break;
2738 	}
2739 }
2740 
2741 static void get_openreq4(const struct request_sock *req,
2742 			 struct seq_file *f, int i)
2743 {
2744 	const struct inet_request_sock *ireq = inet_rsk(req);
2745 	long delta = req->rsk_timer.expires - jiffies;
2746 
2747 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2748 		" %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2749 		i,
2750 		ireq->ir_loc_addr,
2751 		ireq->ir_num,
2752 		ireq->ir_rmt_addr,
2753 		ntohs(ireq->ir_rmt_port),
2754 		TCP_SYN_RECV,
2755 		0, 0, /* could print option size, but that is af dependent. */
2756 		1,    /* timers active (only the expire timer) */
2757 		jiffies_delta_to_clock_t(delta),
2758 		req->num_timeout,
2759 		from_kuid_munged(seq_user_ns(f),
2760 				 sk_uid(req->rsk_listener)),
2761 		0,  /* non standard timer */
2762 		0, /* open_requests have no inode */
2763 		0,
2764 		req);
2765 }
2766 
/* Print the listing line for a full TCP socket @sk into @f; @i is the
 * entry's index. The socket is not locked here, so several fields are
 * sampled with READ_ONCE() / acquire loads.
 *
 * The timer column is encoded as: 1 for retransmit, reordering or
 * loss-probe timers, 4 for the zero-window probe timer, 2 for a pending
 * keepalive timer and 0 when none of these is pending.
 */
2767 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2768 {
2769 	int timer_active;
2770 	unsigned long timer_expires;
2771 	const struct tcp_sock *tp = tcp_sk(sk);
2772 	const struct inet_connection_sock *icsk = inet_csk(sk);
2773 	const struct inet_sock *inet = inet_sk(sk);
2774 	const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2775 	__be32 dest = inet->inet_daddr;
2776 	__be32 src = inet->inet_rcv_saddr;
2777 	__u16 destp = ntohs(inet->inet_dport);
2778 	__u16 srcp = ntohs(inet->inet_sport);
2779 	u8 icsk_pending;
2780 	int rx_queue;
2781 	int state;
2782 
2783 	icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2784 	if (icsk_pending == ICSK_TIME_RETRANS ||
2785 	    icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2786 	    icsk_pending == ICSK_TIME_LOSS_PROBE) {
2787 		timer_active	= 1;
2788 		timer_expires	= tcp_timeout_expires(sk);
2789 	} else if (icsk_pending == ICSK_TIME_PROBE0) {
2790 		timer_active	= 4;
2791 		timer_expires	= tcp_timeout_expires(sk);
2792 	} else if (timer_pending(&icsk->icsk_keepalive_timer)) {
2793 		timer_active	= 2;
2794 		timer_expires	= icsk->icsk_keepalive_timer.expires;
2795 	} else {
		/* No timer: use "now" so the printed delta below is zero. */
2796 		timer_active	= 0;
2797 		timer_expires = jiffies;
2798 	}
2799 
	/* For listeners the rx_queue column reports the accept backlog. */
2800 	state = inet_sk_state_load(sk);
2801 	if (state == TCP_LISTEN)
2802 		rx_queue = READ_ONCE(sk->sk_ack_backlog);
2803 	else
2804 		/* Because we don't lock the socket,
2805 		 * we might find a transient negative value.
2806 		 */
2807 		rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2808 				      READ_ONCE(tp->copied_seq), 0);
2809 
	/* The last column is fastopenq->max_qlen for listeners; otherwise it
	 * is snd_ssthresh, or -1 while still in initial slow start.
	 */
2810 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2811 			"%08X %5u %8d %llu %d %pK %lu %lu %u %u %d",
2812 		i, src, srcp, dest, destp, state,
2813 		READ_ONCE(tp->write_seq) - tp->snd_una,
2814 		rx_queue,
2815 		timer_active,
2816 		jiffies_delta_to_clock_t(timer_expires - jiffies),
2817 		READ_ONCE(icsk->icsk_retransmits),
2818 		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
2819 		READ_ONCE(icsk->icsk_probes_out),
2820 		sock_i_ino(sk),
2821 		refcount_read(&sk->sk_refcnt), sk,
2822 		jiffies_to_clock_t(icsk->icsk_rto),
2823 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2824 		(icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2825 		tcp_snd_cwnd(tp),
2826 		state == TCP_LISTEN ?
2827 		    fastopenq->max_qlen :
2828 		    (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2829 }
2830 
2831 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2832 			       struct seq_file *f, int i)
2833 {
2834 	long delta = tw->tw_timer.expires - jiffies;
2835 	__be32 dest, src;
2836 	__u16 destp, srcp;
2837 
2838 	dest  = tw->tw_daddr;
2839 	src   = tw->tw_rcv_saddr;
2840 	destp = ntohs(tw->tw_dport);
2841 	srcp  = ntohs(tw->tw_sport);
2842 
2843 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2844 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2845 		i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2846 		3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2847 		refcount_read(&tw->tw_refcnt), tw);
2848 }
2849 
2850 #define TMPSZ 150
2851 
2852 static int tcp4_seq_show(struct seq_file *seq, void *v)
2853 {
2854 	struct tcp_iter_state *st;
2855 	struct sock *sk = v;
2856 
2857 	seq_setwidth(seq, TMPSZ - 1);
2858 	if (v == SEQ_START_TOKEN) {
2859 		seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2860 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2861 			   "inode");
2862 		goto out;
2863 	}
2864 	st = seq->private;
2865 
2866 	if (sk->sk_state == TCP_TIME_WAIT)
2867 		get_timewait4_sock(v, seq, st->num);
2868 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
2869 		get_openreq4(v, seq, st->num);
2870 	else
2871 		get_tcp4_sock(v, seq, st->num);
2872 out:
2873 	seq_pad(seq, '\n');
2874 	return 0;
2875 }
2876 
2877 #ifdef CONFIG_BPF_SYSCALL
/* One slot of the BPF TCP iterator batch. A slot holds either a referenced
 * socket pointer or, once that reference has been dropped, the socket's
 * cookie, which is used to find the socket again on resume.
 */
2878 union bpf_tcp_iter_batch_item {
2879 	struct sock *sk;
2880 	__u64 cookie;
2881 };
2882 
/* Private state of the BPF TCP iterator.
 *
 * @state:  embedded generic TCP seq iterator state (bucket, offset, ...)
 * @cur_sk: index in @batch of the next item not yet consumed
 * @end_sk: number of valid items in @batch (one past the last valid index)
 * @max_sk: capacity of @batch, in items
 * @batch:  array of batched sockets / cookies
 */
2883 struct bpf_tcp_iter_state {
2884 	struct tcp_iter_state state;
2885 	unsigned int cur_sk;
2886 	unsigned int end_sk;
2887 	unsigned int max_sk;
2888 	union bpf_tcp_iter_batch_item *batch;
2889 };
2890 
/* Context passed to the BPF program of a TCP iterator: iterator metadata,
 * the socket being visited and a uid value supplied by the caller.
 */
2891 struct bpf_iter__tcp {
2892 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
2893 	__bpf_md_ptr(struct sock_common *, sk_common);
2894 	uid_t uid __aligned(8);
2895 };
2896 
2897 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2898 			     struct sock_common *sk_common, uid_t uid)
2899 {
2900 	struct bpf_iter__tcp ctx;
2901 
2902 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
2903 	ctx.meta = meta;
2904 	ctx.sk_common = sk_common;
2905 	ctx.uid = uid;
2906 	return bpf_iter_run_prog(prog, &ctx);
2907 }
2908 
2909 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2910 {
2911 	union bpf_tcp_iter_batch_item *item;
2912 	unsigned int cur_sk = iter->cur_sk;
2913 	__u64 cookie;
2914 
2915 	/* Remember the cookies of the sockets we haven't seen yet, so we can
2916 	 * pick up where we left off next time around.
2917 	 */
2918 	while (cur_sk < iter->end_sk) {
2919 		item = &iter->batch[cur_sk++];
2920 		cookie = sock_gen_cookie(item->sk);
2921 		sock_gen_put(item->sk);
2922 		item->cookie = cookie;
2923 	}
2924 }
2925 
2926 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2927 				      unsigned int new_batch_sz, gfp_t flags)
2928 {
2929 	union bpf_tcp_iter_batch_item *new_batch;
2930 
2931 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2932 			     flags | __GFP_NOWARN);
2933 	if (!new_batch)
2934 		return -ENOMEM;
2935 
2936 	memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
2937 	kvfree(iter->batch);
2938 	iter->batch = new_batch;
2939 	iter->max_sk = new_batch_sz;
2940 
2941 	return 0;
2942 }
2943 
2944 static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
2945 					       union bpf_tcp_iter_batch_item *cookies,
2946 					       int n_cookies)
2947 {
2948 	struct hlist_nulls_node *node;
2949 	struct sock *sk;
2950 	int i;
2951 
2952 	for (i = 0; i < n_cookies; i++) {
2953 		sk = first_sk;
2954 		sk_nulls_for_each_from(sk, node)
2955 			if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
2956 				return sk;
2957 	}
2958 
2959 	return NULL;
2960 }
2961 
2962 static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
2963 {
2964 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2965 	struct bpf_tcp_iter_state *iter = seq->private;
2966 	struct tcp_iter_state *st = &iter->state;
2967 	unsigned int find_cookie = iter->cur_sk;
2968 	unsigned int end_cookie = iter->end_sk;
2969 	int resume_bucket = st->bucket;
2970 	struct sock *sk;
2971 
2972 	if (end_cookie && find_cookie == end_cookie)
2973 		++st->bucket;
2974 
2975 	sk = listening_get_first(seq);
2976 	iter->cur_sk = 0;
2977 	iter->end_sk = 0;
2978 
2979 	if (sk && st->bucket == resume_bucket && end_cookie) {
2980 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
2981 						end_cookie - find_cookie);
2982 		if (!sk) {
2983 			spin_unlock(&hinfo->lhash2[st->bucket].lock);
2984 			++st->bucket;
2985 			sk = listening_get_first(seq);
2986 		}
2987 	}
2988 
2989 	return sk;
2990 }
2991 
2992 static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
2993 {
2994 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2995 	struct bpf_tcp_iter_state *iter = seq->private;
2996 	struct tcp_iter_state *st = &iter->state;
2997 	unsigned int find_cookie = iter->cur_sk;
2998 	unsigned int end_cookie = iter->end_sk;
2999 	int resume_bucket = st->bucket;
3000 	struct sock *sk;
3001 
3002 	if (end_cookie && find_cookie == end_cookie)
3003 		++st->bucket;
3004 
3005 	sk = established_get_first(seq);
3006 	iter->cur_sk = 0;
3007 	iter->end_sk = 0;
3008 
3009 	if (sk && st->bucket == resume_bucket && end_cookie) {
3010 		sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
3011 						end_cookie - find_cookie);
3012 		if (!sk) {
3013 			spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3014 			++st->bucket;
3015 			sk = established_get_first(seq);
3016 		}
3017 	}
3018 
3019 	return sk;
3020 }
3021 
3022 static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
3023 {
3024 	struct bpf_tcp_iter_state *iter = seq->private;
3025 	struct tcp_iter_state *st = &iter->state;
3026 	struct sock *sk = NULL;
3027 
3028 	switch (st->state) {
3029 	case TCP_SEQ_STATE_LISTENING:
3030 		sk = bpf_iter_tcp_resume_listening(seq);
3031 		if (sk)
3032 			break;
3033 		st->bucket = 0;
3034 		st->state = TCP_SEQ_STATE_ESTABLISHED;
3035 		fallthrough;
3036 	case TCP_SEQ_STATE_ESTABLISHED:
3037 		sk = bpf_iter_tcp_resume_established(seq);
3038 		break;
3039 	}
3040 
3041 	return sk;
3042 }
3043 
3044 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3045 						 struct sock **start_sk)
3046 {
3047 	struct bpf_tcp_iter_state *iter = seq->private;
3048 	struct hlist_nulls_node *node;
3049 	unsigned int expected = 1;
3050 	struct sock *sk;
3051 
3052 	sock_hold(*start_sk);
3053 	iter->batch[iter->end_sk++].sk = *start_sk;
3054 
3055 	sk = sk_nulls_next(*start_sk);
3056 	*start_sk = NULL;
3057 	sk_nulls_for_each_from(sk, node) {
3058 		if (seq_sk_match(seq, sk)) {
3059 			if (iter->end_sk < iter->max_sk) {
3060 				sock_hold(sk);
3061 				iter->batch[iter->end_sk++].sk = sk;
3062 			} else if (!*start_sk) {
3063 				/* Remember where we left off. */
3064 				*start_sk = sk;
3065 			}
3066 			expected++;
3067 		}
3068 	}
3069 
3070 	return expected;
3071 }
3072 
3073 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3074 						   struct sock **start_sk)
3075 {
3076 	struct bpf_tcp_iter_state *iter = seq->private;
3077 	struct hlist_nulls_node *node;
3078 	unsigned int expected = 1;
3079 	struct sock *sk;
3080 
3081 	sock_hold(*start_sk);
3082 	iter->batch[iter->end_sk++].sk = *start_sk;
3083 
3084 	sk = sk_nulls_next(*start_sk);
3085 	*start_sk = NULL;
3086 	sk_nulls_for_each_from(sk, node) {
3087 		if (seq_sk_match(seq, sk)) {
3088 			if (iter->end_sk < iter->max_sk) {
3089 				sock_hold(sk);
3090 				iter->batch[iter->end_sk++].sk = sk;
3091 			} else if (!*start_sk) {
3092 				/* Remember where we left off. */
3093 				*start_sk = sk;
3094 			}
3095 			expected++;
3096 		}
3097 	}
3098 
3099 	return expected;
3100 }
3101 
3102 static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
3103 					struct sock **start_sk)
3104 {
3105 	struct bpf_tcp_iter_state *iter = seq->private;
3106 	struct tcp_iter_state *st = &iter->state;
3107 
3108 	if (st->state == TCP_SEQ_STATE_LISTENING)
3109 		return bpf_iter_tcp_listening_batch(seq, start_sk);
3110 	else
3111 		return bpf_iter_tcp_established_batch(seq, start_sk);
3112 }
3113 
3114 static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
3115 {
3116 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3117 	struct bpf_tcp_iter_state *iter = seq->private;
3118 	struct tcp_iter_state *st = &iter->state;
3119 
3120 	if (st->state == TCP_SEQ_STATE_LISTENING)
3121 		spin_unlock(&hinfo->lhash2[st->bucket].lock);
3122 	else
3123 		spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
3124 }
3125 
/* Build the next batch of sockets for the bpf TCP iterator.
 *
 * Up to three attempts are made to capture every matching socket of the
 * current bucket in iter->batch:
 *  1. with the current batch capacity;
 *  2. after dropping the bucket lock, releasing the partial batch and
 *     growing the batch to 1.5x the number of sockets seen (GFP_USER);
 *  3. without dropping the bucket lock, after growing the batch to exactly
 *     the number of sockets seen (GFP_NOWAIT).
 *
 * The bucket lock is always released before returning.
 *
 * Returns the first socket of the new batch, NULL when
 * bpf_iter_tcp_resume() finds no more sockets, or an ERR_PTR() when
 * resizing the batch fails.
 */
static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
{
	struct bpf_tcp_iter_state *iter = seq->private;
	unsigned int expected;
	struct sock *sk;
	int err;

	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	/* 'expected' counts every matching socket in the bucket, while
	 * iter->end_sk counts only those that fit; equal means complete.
	 */
	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was too small. */
	bpf_iter_tcp_unlock_bucket(seq);
	/* NOTE(review): presumably drops the references taken on the batched
	 * sockets; bpf_iter_tcp_put_batch() is defined outside this view.
	 */
	bpf_iter_tcp_put_batch(iter);
	err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
					 GFP_USER);
	if (err)
		return ERR_PTR(err);

	/* The lock was dropped, so the position has to be looked up again. */
	sk = bpf_iter_tcp_resume(seq);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_fill_batch(seq, &sk);
	if (likely(iter->end_sk == expected))
		goto done;

	/* Batch size was still too small. Hold onto the lock while we try
	 * again with a larger batch to make sure the current bucket's size
	 * does not change in the meantime.
	 */
	err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
	if (err) {
		bpf_iter_tcp_unlock_bucket(seq);
		return ERR_PTR(err);
	}

	/* 'sk' is the first socket that did not fit in the previous attempt,
	 * so this call appends the remainder to the batch.
	 */
	expected = bpf_iter_fill_batch(seq, &sk);
	WARN_ON_ONCE(iter->end_sk != expected);
done:
	bpf_iter_tcp_unlock_bucket(seq);
	return iter->batch[0].sk;
}
3173 
3174 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3175 {
3176 	/* bpf iter does not support lseek, so it always
3177 	 * continue from where it was stop()-ped.
3178 	 */
3179 	if (*pos)
3180 		return bpf_iter_tcp_batch(seq);
3181 
3182 	return SEQ_START_TOKEN;
3183 }
3184 
3185 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3186 {
3187 	struct bpf_tcp_iter_state *iter = seq->private;
3188 	struct tcp_iter_state *st = &iter->state;
3189 	struct sock *sk;
3190 
3191 	/* Whenever seq_next() is called, the iter->cur_sk is
3192 	 * done with seq_show(), so advance to the next sk in
3193 	 * the batch.
3194 	 */
3195 	if (iter->cur_sk < iter->end_sk) {
3196 		/* Keeping st->num consistent in tcp_iter_state.
3197 		 * bpf_iter_tcp does not use st->num.
3198 		 * meta.seq_num is used instead.
3199 		 */
3200 		st->num++;
3201 		sock_gen_put(iter->batch[iter->cur_sk++].sk);
3202 	}
3203 
3204 	if (iter->cur_sk < iter->end_sk)
3205 		sk = iter->batch[iter->cur_sk].sk;
3206 	else
3207 		sk = bpf_iter_tcp_batch(seq);
3208 
3209 	++*pos;
3210 	/* Keeping st->last_pos consistent in tcp_iter_state.
3211 	 * bpf iter does not do lseek, so st->last_pos always equals to *pos.
3212 	 */
3213 	st->last_pos = *pos;
3214 	return sk;
3215 }
3216 
3217 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3218 {
3219 	struct bpf_iter_meta meta;
3220 	struct bpf_prog *prog;
3221 	struct sock *sk = v;
3222 	uid_t uid;
3223 	int ret;
3224 
3225 	if (v == SEQ_START_TOKEN)
3226 		return 0;
3227 
3228 	if (sk_fullsock(sk))
3229 		lock_sock(sk);
3230 
3231 	if (unlikely(sk_unhashed(sk))) {
3232 		ret = SEQ_SKIP;
3233 		goto unlock;
3234 	}
3235 
3236 	if (sk->sk_state == TCP_TIME_WAIT) {
3237 		uid = 0;
3238 	} else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3239 		const struct request_sock *req = v;
3240 
3241 		uid = from_kuid_munged(seq_user_ns(seq),
3242 				       sk_uid(req->rsk_listener));
3243 	} else {
3244 		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3245 	}
3246 
3247 	meta.seq = seq;
3248 	prog = bpf_iter_get_info(&meta, false);
3249 	ret = tcp_prog_seq_show(prog, &meta, v, uid);
3250 
3251 unlock:
3252 	if (sk_fullsock(sk))
3253 		release_sock(sk);
3254 	return ret;
3255 
3256 }
3257 
3258 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3259 {
3260 	struct bpf_tcp_iter_state *iter = seq->private;
3261 	struct bpf_iter_meta meta;
3262 	struct bpf_prog *prog;
3263 
3264 	if (!v) {
3265 		meta.seq = seq;
3266 		prog = bpf_iter_get_info(&meta, true);
3267 		if (prog)
3268 			(void)tcp_prog_seq_show(prog, &meta, v, 0);
3269 	}
3270 
3271 	if (iter->cur_sk < iter->end_sk)
3272 		bpf_iter_tcp_put_batch(iter);
3273 }
3274 
/* seq_file callbacks of the bpf TCP iterator.  seq_file_family() compares
 * seq->op against this table to tell a bpf iterator from a procfs reader.
 */
static const struct seq_operations bpf_iter_tcp_seq_ops = {
	.show		= bpf_iter_tcp_seq_show,
	.start		= bpf_iter_tcp_seq_start,
	.next		= bpf_iter_tcp_seq_next,
	.stop		= bpf_iter_tcp_seq_stop,
};
3281 #endif
3282 static unsigned short seq_file_family(const struct seq_file *seq)
3283 {
3284 	const struct tcp_seq_afinfo *afinfo;
3285 
3286 #ifdef CONFIG_BPF_SYSCALL
3287 	/* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3288 	if (seq->op == &bpf_iter_tcp_seq_ops)
3289 		return AF_UNSPEC;
3290 #endif
3291 
3292 	/* Iterated from proc fs */
3293 	afinfo = pde_data(file_inode(seq->file));
3294 	return afinfo->family;
3295 }
3296 
/* seq_file callbacks for the "tcp" procfs entry created in
 * tcp4_proc_init_net().
 */
static const struct seq_operations tcp4_seq_ops = {
	.show		= tcp4_seq_show,
	.start		= tcp_seq_start,
	.next		= tcp_seq_next,
	.stop		= tcp_seq_stop,
};
3303 
/* Private data of the "tcp" procfs entry; seq_file_family() reads .family
 * from it for procfs readers.
 */
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.family		= AF_INET,
};
3307 
3308 static int __net_init tcp4_proc_init_net(struct net *net)
3309 {
3310 	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3311 			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3312 		return -ENOMEM;
3313 	return 0;
3314 }
3315 
/* Remove the "tcp" entry created by tcp4_proc_init_net() from @net. */
static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	remove_proc_entry("tcp", net->proc_net);
}
3320 
/* Per-network-namespace hooks that create and remove the "tcp" procfs
 * entry; registered by tcp4_proc_init().
 */
static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};
3325 
/* Register the per-netns procfs hooks in tcp4_net_ops.
 *
 * Returns the result of register_pernet_subsys().
 */
int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}
3330 
/* Unregister the per-netns procfs hooks registered by tcp4_proc_init(). */
void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
3335 #endif /* CONFIG_PROC_FS */
3336 
/* Protocol descriptor ("TCP") wiring the TCP implementation into struct
 * proto: socket operations, hashing and port management, memory accounting
 * hooks, and object sizing (objects are struct tcp_sock, allocated with
 * SLAB_TYPESAFE_BY_RCU).  Exported for use outside this file.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.splice_eof		= tcp_splice_eof,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,

	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,

	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	/* rmem/wmem limits live in struct net, so only offsets are stored. */
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.freeptr_offset		= offsetof(struct tcp_sock,
					   inet_conn.icsk_inet.sk.sk_freeptr),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* The hash tables are reached through net->ipv4.tcp_death_row.hashinfo
	 * (see tcp_set_hashinfo()), not through this field.
	 */
	.h.hashinfo		= NULL,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
3389 
3390 static void __net_exit tcp_sk_exit(struct net *net)
3391 {
3392 	if (net->ipv4.tcp_congestion_control)
3393 		bpf_module_put(net->ipv4.tcp_congestion_control,
3394 			       net->ipv4.tcp_congestion_control->owner);
3395 }
3396 
3397 static void __net_init tcp_set_hashinfo(struct net *net)
3398 {
3399 	struct inet_hashinfo *hinfo;
3400 	unsigned int ehash_entries;
3401 	struct net *old_net;
3402 
3403 	if (net_eq(net, &init_net))
3404 		goto fallback;
3405 
3406 	old_net = current->nsproxy->net_ns;
3407 	ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3408 	if (!ehash_entries)
3409 		goto fallback;
3410 
3411 	ehash_entries = roundup_pow_of_two(ehash_entries);
3412 	hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3413 	if (!hinfo) {
3414 		pr_warn("Failed to allocate TCP ehash (entries: %u) "
3415 			"for a netns, fallback to the global one\n",
3416 			ehash_entries);
3417 fallback:
3418 		hinfo = &tcp_hashinfo;
3419 		ehash_entries = tcp_hashinfo.ehash_mask + 1;
3420 	}
3421 
3422 	net->ipv4.tcp_death_row.hashinfo = hinfo;
3423 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3424 	net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3425 }
3426 
3427 static int __net_init tcp_sk_init(struct net *net)
3428 {
3429 	net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
3430 	net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
3431 	net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
3432 	net->ipv4.sysctl_tcp_ecn_fallback = 1;
3433 
3434 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3435 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3436 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3437 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3438 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3439 
3440 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3441 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3442 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3443 
3444 	net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3445 	net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3446 	net->ipv4.sysctl_tcp_syncookies = 1;
3447 	net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3448 	net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3449 	net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3450 	net->ipv4.sysctl_tcp_orphan_retries = 0;
3451 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3452 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3453 	net->ipv4.sysctl_tcp_tw_reuse = 2;
3454 	net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3455 	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3456 
3457 	refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3458 	tcp_set_hashinfo(net);
3459 
3460 	net->ipv4.sysctl_tcp_sack = 1;
3461 	net->ipv4.sysctl_tcp_window_scaling = 1;
3462 	net->ipv4.sysctl_tcp_timestamps = 1;
3463 	net->ipv4.sysctl_tcp_early_retrans = 3;
3464 	net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3465 	net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3466 	net->ipv4.sysctl_tcp_retrans_collapse = 1;
3467 	net->ipv4.sysctl_tcp_max_reordering = 300;
3468 	net->ipv4.sysctl_tcp_dsack = 1;
3469 	net->ipv4.sysctl_tcp_app_win = 31;
3470 	net->ipv4.sysctl_tcp_adv_win_scale = 1;
3471 	net->ipv4.sysctl_tcp_frto = 2;
3472 	net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3473 	net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
3474 	/* This limits the percentage of the congestion window which we
3475 	 * will allow a single TSO frame to consume.  Building TSO frames
3476 	 * which are too large can cause TCP streams to be bursty.
3477 	 */
3478 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3479 	/* Default TSQ limit of 4 MB */
3480 	net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3481 
3482 	/* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3483 	net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3484 
3485 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
3486 	net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3487 	net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3488 	net->ipv4.sysctl_tcp_autocorking = 1;
3489 	net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3490 	net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3491 	net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3492 	if (net != &init_net) {
3493 		memcpy(net->ipv4.sysctl_tcp_rmem,
3494 		       init_net.ipv4.sysctl_tcp_rmem,
3495 		       sizeof(init_net.ipv4.sysctl_tcp_rmem));
3496 		memcpy(net->ipv4.sysctl_tcp_wmem,
3497 		       init_net.ipv4.sysctl_tcp_wmem,
3498 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
3499 	}
3500 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3501 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
3502 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3503 	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
3504 	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3505 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3506 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3507 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3508 
3509 	/* Set default values for PLB */
3510 	net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3511 	net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3512 	net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3513 	net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3514 	/* Default congestion threshold for PLB to mark a round is 50% */
3515 	net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3516 
3517 	/* Reno is always built in */
3518 	if (!net_eq(net, &init_net) &&
3519 	    bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3520 			       init_net.ipv4.tcp_congestion_control->owner))
3521 		net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3522 	else
3523 		net->ipv4.tcp_congestion_control = &tcp_reno;
3524 
3525 	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3526 	net->ipv4.sysctl_tcp_shrink_window = 0;
3527 
3528 	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3529 	net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3530 	net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3531 
3532 	return 0;
3533 }
3534 
3535 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3536 {
3537 	struct net *net;
3538 
3539 	/* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3540 	 * and failed setup_net error unwinding path are serialized.
3541 	 *
3542 	 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3543 	 * net_exit_list, the thread that dismantles a particular twsk must
3544 	 * do so without other thread progressing to refcount_dec_and_test() of
3545 	 * tcp_death_row.tw_refcount.
3546 	 */
3547 	mutex_lock(&tcp_exit_batch_mutex);
3548 
3549 	tcp_twsk_purge(net_exit_list);
3550 
3551 	list_for_each_entry(net, net_exit_list, exit_list) {
3552 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3553 		WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3554 		tcp_fastopen_ctx_destroy(net);
3555 	}
3556 
3557 	mutex_unlock(&tcp_exit_batch_mutex);
3558 }
3559 
/* Per-network-namespace TCP setup and teardown hooks; registered from
 * tcp_v4_init().
 */
static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};
3565 
3566 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "tcp" bpf iterator with its context arguments: the iterator
 * meta data, the socket being visited and its uid.
 * NOTE(review): the exact expansion comes from the DEFINE_BPF_ITER_FUNC
 * macro, which is defined outside this file.
 */
DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
		     struct sock_common *sk_common, uid_t uid)

/* Initial batch capacity, in batch entries, requested by bpf_iter_init_tcp(). */
#define INIT_BATCH_SZ 16
3571 
3572 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3573 {
3574 	struct bpf_tcp_iter_state *iter = priv_data;
3575 	int err;
3576 
3577 	err = bpf_iter_init_seq_net(priv_data, aux);
3578 	if (err)
3579 		return err;
3580 
3581 	err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
3582 	if (err) {
3583 		bpf_iter_fini_seq_net(priv_data);
3584 		return err;
3585 	}
3586 
3587 	return 0;
3588 }
3589 
/* Tear down the private state set up by bpf_iter_init_tcp(): release the
 * netns part of the state, then free the batch array.
 */
static void bpf_iter_fini_tcp(void *priv_data)
{
	struct bpf_tcp_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
3597 
/* seq_file description of the bpf TCP iterator: its callbacks, the
 * init/fini hooks for the private state and the size of that state.
 */
static const struct bpf_iter_seq_info tcp_seq_info = {
	.seq_ops		= &bpf_iter_tcp_seq_ops,
	.init_seq_private	= bpf_iter_init_tcp,
	.fini_seq_private	= bpf_iter_fini_tcp,
	.seq_priv_size		= sizeof(struct bpf_tcp_iter_state),
};
3604 
3605 static const struct bpf_func_proto *
3606 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3607 			    const struct bpf_prog *prog)
3608 {
3609 	switch (func_id) {
3610 	case BPF_FUNC_setsockopt:
3611 		return &bpf_sk_setsockopt_proto;
3612 	case BPF_FUNC_getsockopt:
3613 		return &bpf_sk_getsockopt_proto;
3614 	default:
3615 		return NULL;
3616 	}
3617 }
3618 
/* Registration record of the "tcp" bpf iterator target.  It describes one
 * context argument, the sk_common member of struct bpf_iter__tcp, whose
 * btf_id is filled in at boot by bpf_iter_register().
 */
static struct bpf_iter_reg tcp_reg_info = {
	.target			= "tcp",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__tcp, sk_common),
		  PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
	},
	.get_func_proto		= bpf_iter_tcp_get_func_proto,
	.seq_info		= &tcp_seq_info,
};
3629 
3630 static void __init bpf_iter_register(void)
3631 {
3632 	tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3633 	if (bpf_iter_reg_target(&tcp_reg_info))
3634 		pr_warn("Warning: could not register bpf iterator tcp\n");
3635 }
3636 
3637 #endif
3638 
3639 void __init tcp_v4_init(void)
3640 {
3641 	int cpu, res;
3642 
3643 	for_each_possible_cpu(cpu) {
3644 		struct sock *sk;
3645 
3646 		res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3647 					   IPPROTO_TCP, &init_net);
3648 		if (res)
3649 			panic("Failed to create the TCP control socket.\n");
3650 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3651 
3652 		/* Please enforce IP_DF and IPID==0 for RST and
3653 		 * ACK sent in SYN-RECV and TIME-WAIT state.
3654 		 */
3655 		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3656 
3657 		sk->sk_clockid = CLOCK_MONOTONIC;
3658 
3659 		per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3660 	}
3661 	if (register_pernet_subsys(&tcp_sk_ops))
3662 		panic("Failed to create the TCP control socket.\n");
3663 
3664 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3665 	bpf_iter_register();
3666 #endif
3667 }
3668