/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *	     				Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */


#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/netdma.h>
#include <net/secure_seq.h>
#include <net/tcp_memcontrol.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

#include <linux/crypto.h>
#include <linux/scatterlist.h>

int sysctl_tcp_tw_reuse __read_mostly;
int sysctl_tcp_low_latency __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_low_latency);


#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
						   __be32 addr);
static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#else
static inline
struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	return NULL;
}
#endif

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);

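/* Pick the initial sequence number for a connection from the addresses
 * and ports of the incoming SYN, so each four-tuple gets its own
 * hard-to-predict sequence space.
 */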
static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
{
	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
					  ip_hdr(skb)->saddr,
					  tcp_hdr(skb)->dest,
					  tcp_hdr(skb)->source);
}

int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only the timestamp cache
	   is held not per host, but per port pair, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
	if (tcptw->tw_ts_recent_stamp &&
	    (twp == NULL || (sysctl_tcp_tw_reuse &&
			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
		if (tp->write_seq == 0)
			tp->write_seq = 1;
		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
		sock_hold(sktw);
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(tcp_twsk_unique);

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     sock_owned_by_user(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk, true);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	inet->inet_rcv_saddr = inet->inet_saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
		/*
		 * VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table when entering
		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
		 * when trying a new connection.
		 */
		if (peer) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
				tp->rx_opt.ts_recent = peer->tcp_ts;
			}
		}
	}

	inet->inet_dport = usin->sin_port;
	inet->inet_daddr = daddr;

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_SYMBOL(tcp_v4_connect);

/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go through
	 * unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry whether PMTU discovery is forbidden
	 * on this route. We just assume that no packet-too-big messages are
	 * sent back when PMTU discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment,
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped,
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		icsk->icsk_backoff--;
		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		skb = tcp_write_queue_head(sk);
		BUG_ON(!skb);

		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* The RTO revert clocked out the retransmission;
			 * retransmit now. */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot normally happen.
			       It can, e.g., if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows us to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note that on the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending arbitrary
	 * errors ordered by their masters, even these two messages have
	 * finally lost their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else { /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

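/* Compute the TCP checksum for an outgoing segment. With CHECKSUM_PARTIAL
 * only the pseudo-header sum is stored and the device (or software fallback)
 * fills in the rest; otherwise the full checksum is computed here.
 */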
static void __tcp_v4_send_check(struct sk_buff *skb,
				__be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(skb->len, saddr, daddr,
					 csum_partial(th,
						      th->doff << 2,
						      skb->csum));
	}
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_send_check);

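/* Prepare a GSO super-packet for transmission: zero the checksum field,
 * mark the skb CHECKSUM_PARTIAL and seed the pseudo-header sum so each
 * resulting segment can be fixed up when the packet is split.
 */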
int tcp_v4_gso_send_check(struct sk_buff *skb)
{
	const struct iphdr *iph;
	struct tcphdr *th;

	if (!pskb_may_pull(skb, sizeof(*th)))
		return -EINVAL;

	iph = ip_hdr(skb);
	th = tcp_hdr(skb);

	th->check = 0;
	skb->ip_summed = CHECKSUM_PARTIAL;
	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
	return 0;
}

/*
 *	This routine will send an RST to the other tcp.
 *
 *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 *		      for a reset.
 *	Answer: if a packet caused the RST, it is not for a socket
 *		existing in our system; if it is matched to a socket,
 *		it is just a duplicate segment or a bug in the other side's
 *		TCP. So we build the reply based only on the parameters
 *		that arrived with the segment.
 *	Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
#ifdef CONFIG_TCP_MD5SIG
		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
#endif
	} rep;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct net *net;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest   = th->source;
	rep.th.source = th->dest;
	rep.th.doff   = sizeof(struct tcphdr) / 4;
	rep.th.rst    = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

#ifdef CONFIG_TCP_MD5SIG
	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL;
	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				     key, ip_hdr(skb)->saddr,
				     ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
	/* When the socket is gone, all binding information is lost.
	 * Routing might fail in this case. Use iif for oif to
	 * make sure we can deliver it.
	 */
	arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);

	net = dev_net(skb_dst(skb)->dev);
	arg.tos = ip_hdr(skb)->tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
}

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts, int oif,
			    struct tcp_md5sig_key *key,
			    int reply_flags, u8 tos)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
#ifdef CONFIG_TCP_MD5SIG
			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
#endif
			];
	} rep;
	struct ip_reply_arg arg;
	struct net *net = dev_net(skb_dst(skb)->dev);

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tcp_time_stamp);
		rep.opt[2] = htonl(ts);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (key) {
		int offset = (ts) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
		      &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
}

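/* Answer a segment received for a TIME-WAIT socket with an ACK built from
 * the state remembered in the timewait bucket (snd_nxt, rcv_nxt, window and
 * recent timestamp), then drop our reference on the bucket.
 */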
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);

	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcptw->tw_ts_recent,
			tw->tw_bound_dev_if,
			tcp_twsk_md5_key(tcptw),
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tw->tw_tos
			);

	inet_twsk_put(tw);
}

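/* ACK a segment that matched a pending connection request (SYN_RECV):
 * echo our ISN + 1 and the peer's ISN + 1 taken from the request_sock,
 * since no full socket exists yet.
 */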
static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent,
			0,
			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos);
}

/*
 *	Send a SYN-ACK after having received a SYN.
 *	This still operates on a request_sock only, not on a big
 *	socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
			      struct request_sock *req,
			      struct request_values *rvp)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, rvp);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		err = net_xmit_eval(err);
	}

	dst_release(dst);
	return err;
}

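/* Retransmit a SYN-ACK for a still-pending request and account for it in
 * the retransmission statistics.
 */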
static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
			     struct request_values *rvp)
{
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
	return tcp_v4_send_synack(sk, NULL, req, rvp);
}

/*
 *	IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(inet_rsk(req)->opt);
}

/*
 * Return 1 if a syncookie should be sent
 */
int tcp_syn_flood_action(struct sock *sk,
			 const struct sk_buff *skb,
			 const char *proto)
{
	const char *msg = "Dropping request";
	int want_cookie = 0;
	struct listen_sock *lopt;

#ifdef CONFIG_SYN_COOKIES
	if (sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = 1;
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
	if (!lopt->synflood_warned) {
		lopt->synflood_warned = 1;
		pr_info("%s: Possible SYN flooding on port %d. %s. "
			"Check SNMP counters.\n",
			proto, ntohs(tcp_hdr(skb)->dest), msg);
	}
	return want_cookie;
}
EXPORT_SYMBOL(tcp_syn_flood_action);

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
						  struct sk_buff *skb)
{
	const struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options_rcu *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = sizeof(*dopt) + opt->optlen;

		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(&dopt->opt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

/* Find the Key structure for an address.  */
static struct tcp_md5sig_key *
			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
		return NULL;
	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr)
			return &tp->md5sig_info->keys4[i].base;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
					 struct sock *addr_sk)
{
	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);

static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
						      struct request_sock *req)
{
	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
}

/* This can be called on a newly created socket, from other files */
int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
		      u8 *newkey, u8 newkeylen)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp4_md5sig_key *keys;

	key = tcp_v4_md5_do_lookup(sk, addr);
	if (key) {
		/* Pre-existing entry - just update that one. */
		kfree(key->key);
		key->key = newkey;
		key->keylen = newkeylen;
	} else {
		struct tcp_md5sig_info *md5sig;

		if (!tp->md5sig_info) {
			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
						  GFP_ATOMIC);
			if (!tp->md5sig_info) {
				kfree(newkey);
				return -ENOMEM;
			}
			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		}

		md5sig = tp->md5sig_info;
		if (md5sig->entries4 == 0 &&
		    tcp_alloc_md5sig_pool(sk) == NULL) {
			kfree(newkey);
			return -ENOMEM;
		}

		if (md5sig->alloced4 == md5sig->entries4) {
			keys = kmalloc((sizeof(*keys) *
					(md5sig->entries4 + 1)), GFP_ATOMIC);
			if (!keys) {
				kfree(newkey);
				if (md5sig->entries4 == 0)
					tcp_free_md5sig_pool();
				return -ENOMEM;
			}

			if (md5sig->entries4)
				memcpy(keys, md5sig->keys4,
				       sizeof(*keys) * md5sig->entries4);

			/* Free old key list, and reference new one */
			kfree(md5sig->keys4);
			md5sig->keys4 = keys;
			md5sig->alloced4++;
		}
		md5sig->entries4++;
		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_v4_md5_do_add);

static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
			       u8 *newkey, u8 newkeylen)
{
	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
				 newkey, newkeylen);
}

int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int i;

	for (i = 0; i < tp->md5sig_info->entries4; i++) {
		if (tp->md5sig_info->keys4[i].addr == addr) {
			/* Free the key */
			kfree(tp->md5sig_info->keys4[i].base.key);
			tp->md5sig_info->entries4--;

			if (tp->md5sig_info->entries4 == 0) {
				kfree(tp->md5sig_info->keys4);
				tp->md5sig_info->keys4 = NULL;
				tp->md5sig_info->alloced4 = 0;
				tcp_free_md5sig_pool();
			} else if (tp->md5sig_info->entries4 != i) {
				/* Shift the remaining entries down over
				 * the deleted one. */
				memmove(&tp->md5sig_info->keys4[i],
					&tp->md5sig_info->keys4[i+1],
					(tp->md5sig_info->entries4 - i) *
					 sizeof(struct tcp4_md5sig_key));
			}
			return 0;
		}
	}
	return -ENOENT;
}
EXPORT_SYMBOL(tcp_v4_md5_do_del);

static void tcp_v4_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Free each key, then the key array,
	 * the crypto element, and then decrement our
	 * hold on the last resort crypto.
	 */
	if (tp->md5sig_info->entries4) {
		int i;
		for (i = 0; i < tp->md5sig_info->entries4; i++)
			kfree(tp->md5sig_info->keys4[i].base.key);
		tp->md5sig_info->entries4 = 0;
		tcp_free_md5sig_pool();
	}
	if (tp->md5sig_info->keys4) {
		kfree(tp->md5sig_info->keys4);
		tp->md5sig_info->keys4 = NULL;
		tp->md5sig_info->alloced4 = 0;
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
				 int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	u8 *newkey;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_user(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
		if (!tcp_sk(sk)->md5sig_info)
			return -ENOENT;
		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
	}

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	if (!tcp_sk(sk)->md5sig_info) {
		struct tcp_sock *tp = tcp_sk(sk);
		struct tcp_md5sig_info *p;

		p = kzalloc(sizeof(*p), sk->sk_allocation);
		if (!p)
			return -ENOMEM;

		tp->md5sig_info = p;
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
	}

	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
	if (!newkey)
		return -ENOMEM;
	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
				 newkey, cmd.tcpm_keylen);
}

static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
					__be32 daddr, __be32 saddr, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;

	bp = &hp->md5_blk.ip4;

	/*
	 * 1. the TCP pseudo-header (in the order: source IP address,
	 * destination IP address, zero-padded protocol number, and
	 * segment length)
	 */
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	sg_init_one(&sg, bp, sizeof(*bp));
	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;
	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
			const struct sock *sk, const struct request_sock *req,
			const struct sk_buff *skb)
{
	struct tcp_md5sig_pool *hp;
	struct hash_desc *desc;
	const struct tcphdr *th = tcp_hdr(skb);
	__be32 saddr, daddr;

	if (sk) {
		saddr = inet_sk(sk)->inet_saddr;
		daddr = inet_sk(sk)->inet_daddr;
	} else if (req) {
		saddr = inet_rsk(req)->loc_addr;
		daddr = inet_rsk(req)->rmt_addr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);
		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	hp = tcp_get_md5sig_pool();
	if (!hp)
		goto clear_hash_noput;
	desc = &hp->md5_desc;

	if (crypto_hash_init(desc))
		goto clear_hash;

	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
		goto clear_hash;
	if (tcp_md5_hash_header(hp, th))
		goto clear_hash;
	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(hp, key))
		goto clear_hash;
	if (crypto_hash_final(desc, md5_hash))
		goto clear_hash;

	tcp_put_md5sig_pool();
	return 0;

clear_hash:
	tcp_put_md5sig_pool();
clear_hash_noput:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_SYMBOL(tcp_v4_md5_hash_skb);

static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
{
	/*
	 * This gets called for each TCP segment that arrives,
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
	const __u8 *hash_location = NULL;
	struct tcp_md5sig_key *hash_expected;
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	int genhash;
	unsigned char newhash[16];

	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
	hash_location = tcp_parse_md5sig_option(th);

	/* We've parsed the options - do we have a hash? */
	if (!hash_expected && !hash_location)
		return 0;

	if (hash_expected && !hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
		return 1;
	}

	if (!hash_expected && hash_location) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
		return 1;
	}

	/* Okay, so this is hash_expected and hash_location -
	 * so we need to calculate the checksum.
	 */
	genhash = tcp_v4_md5_hash_skb(newhash,
				      hash_expected,
				      NULL, NULL, skb);

	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
		if (net_ratelimit()) {
			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
			       &iph->saddr, ntohs(th->source),
			       &iph->daddr, ntohs(th->dest),
			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
		}
		return 1;
	}
	return 0;
}

#endif

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
};
#endif

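/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the options, choose an initial sequence number (or a syncookie
 * under SYN flood) and reply with a SYN-ACK.
 */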
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_extend_values tmp_ext;
	struct tcp_options_received tmp_opt;
	const u8 *hash_location;
	struct request_sock *req;
	struct inet_request_sock *ireq;
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = NULL;
	__be32 saddr = ip_hdr(skb)->saddr;
	__be32 daddr = ip_hdr(skb)->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	int want_cookie = 0;

	/* Never answer SYNs sent to broadcast or multicast addresses. */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
		if (!want_cookie)
			goto drop;
	}

	/* The accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. That is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeouts.
	 */
	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
		goto drop;

	req = inet_reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

#ifdef CONFIG_TCP_MD5SIG
	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
#endif

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

	if (tmp_opt.cookie_plus > 0 &&
	    tmp_opt.saw_tstamp &&
	    !tp->rx_opt.cookie_out_never &&
	    (sysctl_tcp_cookie_size > 0 ||
	     (tp->cookie_values != NULL &&
	      tp->cookie_values->cookie_desired > 0))) {
		u8 *c;
		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;

		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
			goto drop_and_release;

		/* Secret recipe starts with IP addresses */
		*mess++ ^= (__force u32)daddr;
		*mess++ ^= (__force u32)saddr;

		/* plus variable length Initiator Cookie */
		c = (u8 *)mess;
		while (l-- > 0)
			*c++ ^= *hash_location++;

		want_cookie = 0;	/* not our kind of cookie */
		tmp_ext.cookie_out_never = 0; /* false */
		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
	} else if (!tp->rx_opt.cookie_in_always) {
		/* redundant indications, but ensure initialization. */
		tmp_ext.cookie_out_never = 1; /* true */
		tmp_ext.cookie_plus = 0;
	} else {
		goto drop_and_release;
	}
	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->no_srccheck = inet_sk(sk)->transparent;
	ireq->opt = tcp_v4_save_options(sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie || tmp_opt.tstamp_ok)
		TCP_ECN_create_request(req, tcp_hdr(skb));

	if (want_cookie) {
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
	} else if (!isn) {
		struct inet_peer *peer = NULL;
		struct flowi4 fl4;

		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    tcp_death_row.sysctl_tw_recycle &&
		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
		    fl4.daddr == saddr &&
		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
			inet_peer_refcheck(peer);
			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.
			 * It means that we continue to communicate
			 * with destinations already remembered at
			 * the moment of the synflood.
			 */
			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
				       &saddr, ntohs(tcp_hdr(skb)->source));
			goto drop_and_release;
		}

		isn = tcp_v4_init_sequence(skb);
	}
	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->snt_synack = tcp_time_stamp;

	if (tcp_v4_send_synack(sk, dst, req,
			       (struct request_values *)&tmp_ext) ||
	    want_cookie)
		goto drop_and_free;

	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key;
#endif
	struct ip_options_rcu *inet_opt;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit_nonewsk;

	newsk->sk_gso_type = SKB_GSO_TCPV4;

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->inet_daddr   = ireq->rmt_addr;
	newinet->inet_rcv_saddr = ireq->loc_addr;
	newinet->inet_saddr   = ireq->loc_addr;
	inet_opt	      = ireq->opt;
	rcu_assign_pointer(newinet->inet_opt, inet_opt);
	ireq->opt	      = NULL;
	newinet->mc_index     = inet_iif(skb);
	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
	inet_csk(newsk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
	newinet->inet_id = newtp->write_seq ^ jiffies;

	if (!dst) {
		dst = inet_csk_route_child_sock(sk, newsk, req);
		if (!dst)
			goto put_and_exit;
	} else {
		/* syncookie case : see end of cookie_v4_check() */
	}
	sk_setup_caps(newsk, dst);

	tcp_mtup_init(newsk);
	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric_advmss(dst);
	if (tcp_sk(sk)->rx_opt.user_mss &&
	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

	tcp_initialize_rcv_mss(newsk);
	if (tcp_rsk(req)->snt_synack)
		tcp_valid_rtt_meas(newsk,
		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
	newtp->total_retrans = req->retrans;

#ifdef CONFIG_TCP_MD5SIG
	/* Copy over the MD5 key from the original socket */
	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
	if (key != NULL) {
		/*
		 * We're using one, so create a matching key
		 * on the newsk structure. If we fail to get
		 * memory, then we end up not copying the key
		 * across. Shucks.
		 */
		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
		if (newkey != NULL)
			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
					  newkey, key->keylen);
		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
	}
#endif

	if (__inet_inherit_port(sk, newsk) < 0)
		goto put_and_exit;
	__inet_hash_nolisten(newsk, NULL);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
	dst_release(dst);
exit:
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
	return NULL;
put_and_exit:
	tcp_clear_xmit_timers(newsk);
	tcp_cleanup_congestion_control(newsk);
	bh_unlock_sock(newsk);
	sock_put(newsk);
	goto exit;
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

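/* Work out what an incoming segment on a listening socket belongs to:
 * first a pending connection request, then an already established socket,
 * and finally (with syncookies) a possible cookie ACK.
 */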
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
			th->source, iph->daddr, th->dest, inet_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->syn)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

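/* Validate the receive checksum. CHECKSUM_COMPLETE packets are verified
 * right away; for the rest we seed the pseudo-header sum, verify short
 * packets (<= 76 bytes) immediately and defer the full check on larger
 * ones until the data is consumed.
 */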
static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);

	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		if (!tcp_v4_check(skb->len, iph->saddr,
				  iph->daddr, skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			return 0;
		}
	}

	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
				       skb->len, IPPROTO_TCP, 0);

	if (skb->len <= 76) {
		return __skb_checksum_complete(skb);
	}
	return 0;
}

/* The socket must have its spinlock held when we get
 * here.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * We really want to reject the packet as early as possible
	 * if:
	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
	 *  o There is an MD5 option and we're not expecting one
	 */
	if (tcp_v4_inbound_md5_hash(sk, skb))
		goto discard;
#endif

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		sock_rps_save_rxhash(sk, skb);
		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
			rsk = sk;
			goto reset;
		}
		return 0;
	}

	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
		goto csum_err;

	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;

		if (nsk != sk) {
			sock_rps_save_rxhash(nsk, skb);
			if (tcp_child_process(sk, nsk, skb)) {
				rsk = nsk;
				goto reset;
			}
			return 0;
		}
	} else
		sock_rps_save_rxhash(sk, skb);

	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
discard:
	kfree_skb(skb);
	/* Be careful here. If this function gets more complicated and
	 * gcc suffers from register pressure on the x86, sk (in %ebx)
	 * might be destroyed here. This current version compiles correctly,
	 * but you have been warned.
	 */
	return 0;

csum_err:
	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
	goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);

/*
 *	From tcp_input.c
 */

tcp_v4_rcv(struct sk_buff * skb)1661 int tcp_v4_rcv(struct sk_buff *skb)
1662 {
1663 	const struct iphdr *iph;
1664 	const struct tcphdr *th;
1665 	struct sock *sk;
1666 	int ret;
1667 	struct net *net = dev_net(skb->dev);
1668 
1669 	if (skb->pkt_type != PACKET_HOST)
1670 		goto discard_it;
1671 
1672 	/* Count it even if it's bad */
1673 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1674 
1675 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1676 		goto discard_it;
1677 
1678 	th = tcp_hdr(skb);
1679 
1680 	if (th->doff < sizeof(struct tcphdr) / 4)
1681 		goto bad_packet;
1682 	if (!pskb_may_pull(skb, th->doff * 4))
1683 		goto discard_it;
1684 
1685 	/* An explanation is required here, I think.
1686 	 * Packet length and doff are validated by header prediction,
1687 	 * provided case of th->doff==0 is eliminated.
1688 	 * So, we defer the checks. */
1689 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1690 		goto bad_packet;
1691 
1692 	th = tcp_hdr(skb);
1693 	iph = ip_hdr(skb);
1694 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1695 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1696 				    skb->len - th->doff * 4);
1697 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1698 	TCP_SKB_CB(skb)->when	 = 0;
1699 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1700 	TCP_SKB_CB(skb)->sacked	 = 0;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto discard_and_relse;
	}

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;
	nf_reset(skb);

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock_nested(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
#ifdef CONFIG_NET_DMA
		struct tcp_sock *tp = tcp_sk(sk);
		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
		if (tp->ucopy.dma_chan)
			ret = tcp_v4_do_rcv(sk, skb);
		else
#endif
		{
			if (!tcp_prequeue(sk, skb))
				ret = tcp_v4_do_rcv(sk, skb);
		}
	} else if (unlikely(sk_add_backlog(sk, skb))) {
		bh_unlock_sock(sk);
		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(NULL, skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
		inet_twsk_put(inet_twsk(sk));
		goto discard_it;
	}
	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
							&tcp_hashinfo,
							iph->daddr, th->dest,
							inet_iif(skb));
		if (sk2) {
			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
			inet_twsk_put(inet_twsk(sk));
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}
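
/* The dispatch above is the standard pattern for delivering to a
 * possibly user-locked socket: run tcp_v4_do_rcv() directly when the
 * owner is not holding the lock, otherwise park the skb on the backlog,
 * which release_sock() later drains via sk_backlog_rcv (wired to
 * tcp_v4_do_rcv in tcp_prot below).  A minimal sketch of the shape,
 * with hypothetical handler names:
 *
 *	bh_lock_sock_nested(sk);
 *	if (!sock_owned_by_user(sk))
 *		ret = handle_now(sk, skb);		// hypothetical
 *	else if (unlikely(sk_add_backlog(sk, skb)))
 *		count_drop_and_free(sk, skb);		// hypothetical
 *	bh_unlock_sock(sk);
 */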

struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
{
	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
	struct inet_sock *inet = inet_sk(sk);
	struct inet_peer *peer;

	if (!rt ||
	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
		peer = inet_getpeer_v4(inet->inet_daddr, 1);
		*release_it = true;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, inet->inet_daddr, 1);
		peer = rt->peer;
		*release_it = false;
	}

	return peer;
}
EXPORT_SYMBOL(tcp_v4_get_peer);

void *tcp_v4_tw_get_peer(struct sock *sk)
{
	const struct inet_timewait_sock *tw = inet_twsk(sk);

	return inet_getpeer_v4(tw->tw_daddr, 1);
}
EXPORT_SYMBOL(tcp_v4_tw_get_peer);

static struct timewait_sock_ops tcp_timewait_sock_ops = {
	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
	.twsk_unique	= tcp_twsk_unique,
	.twsk_destructor= tcp_twsk_destructor,
	.twsk_getpeer	= tcp_v4_tw_get_peer,
};

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.get_peer	   = tcp_v4_get_peer,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
};
EXPORT_SYMBOL(ipv4_specific);

#ifdef CONFIG_TCP_MD5SIG
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
	.md5_lookup		= tcp_v4_md5_lookup,
	.calc_md5_hash		= tcp_v4_md5_hash_skb,
	.md5_add		= tcp_v4_md5_add_func,
	.md5_parse		= tcp_v4_parse_md5_keys,
};
#endif

/* NOTE: Many fields are already zeroed by the call to sk_alloc(),
 *       so they need not be initialized here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = TCP_INIT_CWND;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = sysctl_tcp_reordering;
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	sock_update_memcg(sk);
	sk_sockets_allocated_inc(sk);
	local_bh_enable();

	return 0;
}
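
/* tcp_v4_init_sock() runs once per socket: it is installed as
 * tcp_prot.init below, which the AF_INET socket-creation path invokes
 * through sk->sk_prot->init().  tcp_v4_destroy_sock() is its
 * counterpart on teardown.
 */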

void tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(sk);

	/* Clean up the write buffer. */
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

#ifdef CONFIG_TCP_MD5SIG
	/* Clean up the MD5 key list, if any */
	if (tp->md5sig_info) {
		tcp_v4_clear_md5_list(sk);
		kfree(tp->md5sig_info);
		tp->md5sig_info = NULL;
	}
#endif

#ifdef CONFIG_NET_DMA
	/* Cleans up our sk_async_wait_queue */
	__skb_queue_purge(&sk->sk_async_wait_queue);
#endif

	/* Clean up the prequeue; it really should be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (inet_csk(sk)->icsk_bind_hash)
		inet_put_port(sk);

	/*
	 * If a sendmsg cached page exists, toss it.
	 */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	/* TCP Cookie Transactions */
	if (tp->cookie_values != NULL) {
		kref_put(&tp->cookie_values->kref,
			 tcp_cookie_values_release);
		tp->cookie_values = NULL;
	}

	sk_sockets_allocated_dec(sk);
	sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{
	return hlist_nulls_empty(head) ? NULL :
		list_entry(head->first, struct inet_timewait_sock, tw_node);
}

static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{
	return !is_a_nulls(tw->tw_node.next) ?
		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}

/*
 * Get the next listener socket after cur.  If cur is NULL, get the first
 * socket starting from the bucket given in st->bucket; when st->bucket is
 * zero the very first socket in the hash table is returned.
 */
static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct inet_connection_sock *icsk;
	struct hlist_nulls_node *node;
	struct sock *sk = cur;
	struct inet_listen_hashbucket *ilb;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	if (!sk) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		st->offset = 0;
		goto get_sk;
	}
	ilb = &tcp_hashinfo.listening_hash[st->bucket];
	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		icsk = inet_csk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
				break;
get_req:
			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_nulls_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	} else {
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue))
			goto start_req;
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		sk = sk_nulls_next(sk);
	}
get_sk:
	sk_nulls_for_each_from(sk, node) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		icsk = inet_csk(sk);
		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk = sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
	}
	spin_unlock_bh(&ilb->lock);
	st->offset = 0;
	if (++st->bucket < INET_LHTABLE_SIZE) {
		ilb = &tcp_hashinfo.listening_hash[st->bucket];
		spin_lock_bh(&ilb->lock);
		sk = sk_nulls_head(&ilb->head);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}
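
/* Traversal sketch: the walker scans each listening bucket in turn, and
 * for every listener that has pending connection requests it descends
 * into that listener's syn_table (TCP_SEQ_STATE_OPENREQ) before
 * resuming the bucket chain.  st->bucket, st->sbucket, st->num and
 * st->offset together record the resume position between successive
 * reads of the seq_file.
 */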

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	st->offset = 0;
	rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static inline int empty_bucket(struct tcp_iter_state *st)
{
	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
}
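
/* empty_bucket() reads both chains without the bucket lock; this is a
 * heuristic fast path.  The /proc walk is inherently a racy snapshot,
 * so a socket inserted between this check and the locked scan may
 * simply be missed, which the interface tolerates by design.
 */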

/*
 * Get the first established socket starting from the bucket given in
 * st->bucket.  If st->bucket is zero, the very first socket in the hash
 * is returned.
 */
static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);
	void *rc = NULL;

	st->offset = 0;
	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
		struct sock *sk;
		struct hlist_nulls_node *node;
		struct inet_timewait_sock *tw;
		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);

		/* Lockless fast path for the common case of empty buckets */
		if (empty_bucket(st))
			continue;

		spin_lock_bh(lock);
		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
			if (sk->sk_family != st->family ||
			    !net_eq(sock_net(sk), net)) {
				continue;
			}
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		inet_twsk_for_each(tw, node,
				   &tcp_hashinfo.ehash[st->bucket].twchain) {
			if (tw->tw_family != st->family ||
			    !net_eq(twsk_net(tw), net)) {
				continue;
			}
			rc = tw;
			goto out;
		}
		spin_unlock_bh(lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct inet_timewait_sock *tw;
	struct hlist_nulls_node *node;
	struct tcp_iter_state *st = seq->private;
	struct net *net = seq_file_net(seq);

	++st->num;
	++st->offset;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
			tw = tw_next(tw);
		}
		if (tw) {
			cur = tw;
			goto out;
		}
		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* Look for the next non-empty bucket */
		st->offset = 0;
		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
				empty_bucket(st))
			;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			return NULL;

		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
	} else
		sk = sk_nulls_next(sk);

	sk_nulls_for_each_from(sk, node) {
		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	st->bucket = 0;
	rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seek_last_pos(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	int offset = st->offset;
	int orig_num = st->num;
	void *rc = NULL;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		if (st->bucket >= INET_LHTABLE_SIZE)
			break;
		st->state = TCP_SEQ_STATE_LISTENING;
		rc = listening_get_next(seq, NULL);
		while (offset-- && rc)
			rc = listening_get_next(seq, rc);
		if (rc)
			break;
		st->bucket = 0;
		/* Fallthrough */
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		if (st->bucket > tcp_hashinfo.ehash_mask)
			break;
		rc = established_get_first(seq);
		while (offset-- && rc)
			rc = established_get_next(seq, rc);
	}

	st->num = orig_num;

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc;

	if (*pos && *pos == st->last_pos) {
		rc = tcp_seek_last_pos(seq);
		if (rc)
			goto out;
	}

	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	st->bucket = 0;
	st->offset = 0;
	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;

out:
	st->last_pos = *pos;
	return rc;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			st->bucket = 0;
			st->offset = 0;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	st->last_pos = *pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
		}
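		/* Fall through: also release the listening bucket lock */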
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
		break;
	}
}

int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct tcp_iter_state *s;
	int err;

	err = seq_open_net(inode, file, &afinfo->seq_ops,
			  sizeof(struct tcp_iter_state));
	if (err < 0)
		return err;

	s = ((struct seq_file *)file->private_data)->private;
	s->family		= afinfo->family;
	s->last_pos		= 0;
	return 0;
}
EXPORT_SYMBOL(tcp_seq_open);

int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	afinfo->seq_ops.start		= tcp_seq_start;
	afinfo->seq_ops.next		= tcp_seq_next;
	afinfo->seq_ops.stop		= tcp_seq_stop;

	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
			     afinfo->seq_fops, afinfo);
	if (!p)
		rc = -ENOMEM;
	return rc;
}
EXPORT_SYMBOL(tcp_proc_register);

void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(tcp_proc_unregister);

static void get_openreq4(const struct sock *sk, const struct request_sock *req,
			 struct seq_file *f, int i, int uid, int *len)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->inet_sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non-standard timer */
		0, /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req,
		len);
}

static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
{
	int timer_active;
	unsigned long timer_expires;
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_sock *inet = inet_sk(sk);
	__be32 dest = inet->inet_daddr;
	__be32 src = inet->inet_rcv_saddr;
	__u16 destp = ntohs(inet->inet_dport);
	__u16 srcp = ntohs(inet->inet_sport);
	int rx_queue;

	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= icsk->icsk_timeout;
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= icsk->icsk_timeout;
	} else if (timer_pending(&sk->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sk->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires = jiffies;
	}
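
	/* These codes appear in the "tr" column of /proc/net/tcp:
	 * 1 = retransmit timer, 2 = keepalive timer (sk_timer),
	 * 4 = zero-window probe timer, 0 = no timer pending.
	 */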

	if (sk->sk_state == TCP_LISTEN)
		rx_queue = sk->sk_ack_backlog;
	else
		/*
		 * Because we don't lock the socket, we might find a
		 * transient negative value.
		 */
		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
		i, src, srcp, dest, destp, sk->sk_state,
		tp->write_seq - tp->snd_una,
		rx_queue,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		icsk->icsk_retransmits,
		sock_i_uid(sk),
		icsk->icsk_probes_out,
		sock_i_ino(sk),
		atomic_read(&sk->sk_refcnt), sk,
		jiffies_to_clock_t(icsk->icsk_rto),
		jiffies_to_clock_t(icsk->icsk_ack.ato),
		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
		tp->snd_cwnd,
		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
		len);
}

static void get_timewait4_sock(const struct inet_timewait_sock *tw,
			       struct seq_file *f, int i, int *len)
{
	__be32 dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw, len);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	int len;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, seq, st->num, &len);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, seq, st->num, &len);
		break;
	}
	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
out:
	return 0;
}
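
/* Sample /proc/net/tcp line (illustrative), as emitted by
 * get_tcp4_sock() above for a listener on 127.0.0.1:80.  Addresses are
 * raw __be32 values printed with %08X, so they appear byte-swapped on
 * little-endian hosts; ports are host-order hex:
 *
 *   0: 0100007F:0050 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 ...
 *
 * (State 0A is TCP_LISTEN; "12345" is a hypothetical inode number.)
 */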

static const struct file_operations tcp_afinfo_seq_fops = {
	.owner   = THIS_MODULE,
	.open    = tcp_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net
};

static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.name		= "tcp",
	.family		= AF_INET,
	.seq_fops	= &tcp_afinfo_seq_fops,
	.seq_ops	= {
		.show		= tcp4_seq_show,
	},
};

static int __net_init tcp4_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &tcp4_seq_afinfo);
}

static void __net_exit tcp4_proc_exit_net(struct net *net)
{
	tcp_proc_unregister(net, &tcp4_seq_afinfo);
}

static struct pernet_operations tcp4_net_ops = {
	.init = tcp4_proc_init_net,
	.exit = tcp4_proc_exit_net,
};

int __init tcp4_proc_init(void)
{
	return register_pernet_subsys(&tcp4_net_ops);
}

void tcp4_proc_exit(void)
{
	unregister_pernet_subsys(&tcp4_net_ops);
}
#endif /* CONFIG_PROC_FS */

struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
	const struct iphdr *iph = skb_gro_network_header(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
				  skb->csum)) {
			skb->ip_summed = CHECKSUM_UNNECESSARY;
			break;
		}

		/* fall through */
	case CHECKSUM_NONE:
		NAPI_GRO_CB(skb)->flush = 1;
		return NULL;
	}

	return tcp_gro_receive(head, skb);
}

int tcp4_gro_complete(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
				  iph->saddr, iph->daddr, 0);
	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;

	return tcp_gro_complete(skb);
}
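
/* Note: ~tcp_v4_check(len, saddr, daddr, 0) seeds th->check with only
 * the pseudo-header sum of the merged super-packet; the checksum over
 * the TCP header and payload is completed later, per segment, when the
 * GSO/segmentation path resolves CHECKSUM_PARTIAL.
 */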

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

static int __net_init tcp_sk_init(struct net *net)
{
	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
}

static void __net_exit tcp_sk_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
}

static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}

static struct pernet_operations __net_initdata tcp_sk_ops = {
       .init	   = tcp_sk_init,
       .exit	   = tcp_sk_exit,
       .exit_batch = tcp_sk_exit_batch,
};

void __init tcp_v4_init(void)
{
	inet_hashinfo_init(&tcp_hashinfo);
	if (register_pernet_subsys(&tcp_sk_ops))
		panic("Failed to create the TCP control socket.\n");
}