1 /*-
2 * Copyright (c) 2016-2020 Netflix, Inc.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26 /*
27 * Author: Randall Stewart <rrs@netflix.com>
28 * This work is based on the ACM Queue paper
29 * BBR - Congestion Based Congestion Control
30 * and also numerous discussions with Neal, Yuchung and Van.
31 */
32
33 #include <sys/cdefs.h>
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
37 #include "opt_ratelimit.h"
38 #include <sys/param.h>
39 #include <sys/arb.h>
40 #include <sys/module.h>
41 #include <sys/kernel.h>
42 #ifdef TCP_HHOOK
43 #include <sys/hhook.h>
44 #endif
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/proc.h>
48 #include <sys/qmath.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/sysctl.h>
52 #include <sys/systm.h>
53 #include <sys/tree.h>
54 #ifdef NETFLIX_STATS
55 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
56 #endif
57 #include <sys/refcount.h>
58 #include <sys/queue.h>
59 #include <sys/smp.h>
60 #include <sys/kthread.h>
61 #include <sys/lock.h>
62 #include <sys/mutex.h>
63 #include <sys/tim_filter.h>
64 #include <sys/time.h>
65 #include <vm/uma.h>
66 #include <sys/kern_prefetch.h>
67
68 #include <net/route.h>
69 #include <net/vnet.h>
70 #include <net/ethernet.h>
71 #include <net/bpf.h>
72
73 #define TCPSTATES /* for logging */
74
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/ip.h>
79 #include <netinet/ip_var.h>
80 #include <netinet/ip6.h>
81 #include <netinet6/in6_pcb.h>
82 #include <netinet6/ip6_var.h>
83 #include <netinet/tcp.h>
84 #include <netinet/tcp_fsm.h>
85 #include <netinet/tcp_seq.h>
86 #include <netinet/tcp_timer.h>
87 #include <netinet/tcp_var.h>
88 #include <netinet/tcpip.h>
89 #include <netinet/tcp_ecn.h>
90 #include <netinet/tcp_hpts.h>
91 #include <netinet/tcp_lro.h>
92 #include <netinet/cc/cc.h>
93 #include <netinet/tcp_log_buf.h>
94 #ifdef TCP_OFFLOAD
95 #include <netinet/tcp_offload.h>
96 #endif
97 #ifdef INET6
98 #include <netinet6/tcp6_var.h>
99 #endif
100 #include <netinet/tcp_fastopen.h>
101
102 #include <netipsec/ipsec_support.h>
103 #include <net/if.h>
104 #include <net/if_var.h>
105 #include <net/if_private.h>
106
107 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
108 #include <netipsec/ipsec.h>
109 #include <netipsec/ipsec6.h>
110 #endif /* IPSEC */
111
112 #include <netinet/udp.h>
113 #include <netinet/udp_var.h>
114 #include <machine/in_cksum.h>
115
116 #ifdef MAC
117 #include <security/mac/mac_framework.h>
118 #endif
119 #include "rack_bbr_common.h"
120
121 /*
122 * Common TCP Functions - These are shared by borth
123 * rack and BBR.
124 */
125 static int
ctf_get_enet_type(struct ifnet * ifp,struct mbuf * m)126 ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
127 {
128 struct ether_header *eh;
129 #ifdef INET6
130 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
131 #endif
132 #ifdef INET
133 struct ip *ip = NULL; /* Keep compiler happy. */
134 #endif
135 #if defined(INET) || defined(INET6)
136 struct tcphdr *th;
137 int32_t tlen;
138 uint16_t drop_hdrlen;
139 #endif
140 uint16_t etype;
141 #ifdef INET
142 uint8_t iptos;
143 #endif
144
145 /* Is it the easy way? */
146 if (m->m_flags & M_LRO_EHDRSTRP)
147 return (m->m_pkthdr.lro_etype);
148 /*
149 * Ok this is the old style call, the ethernet header is here.
150 * This also means no checksum or BPF were done. This
151 * can happen if the race to setup the inp fails and
152 * LRO sees no INP at packet input, but by the time
153 * we queue the packets an INP gets there. Its rare
154 * but it can occur so we will handle it. Note that
155 * this means duplicated work but with the rarity of it
156 * its not worth worrying about.
157 */
158 /* Let the BPF see the packet */
159 if (bpf_peers_present(ifp->if_bpf))
160 ETHER_BPF_MTAP(ifp, m);
161 /* Now the csum */
162 eh = mtod(m, struct ether_header *);
163 etype = ntohs(eh->ether_type);
164 m_adj(m, sizeof(*eh));
165 switch (etype) {
166 #ifdef INET6
167 case ETHERTYPE_IPV6:
168 {
169 if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
170 m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
171 if (m == NULL) {
172 KMOD_TCPSTAT_INC(tcps_rcvshort);
173 return (-1);
174 }
175 }
176 ip6 = (struct ip6_hdr *)(eh + 1);
177 th = (struct tcphdr *)(ip6 + 1);
178 drop_hdrlen = sizeof(*ip6);
179 tlen = ntohs(ip6->ip6_plen);
180 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
181 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
182 th->th_sum = m->m_pkthdr.csum_data;
183 else
184 th->th_sum = in6_cksum_pseudo(ip6, tlen,
185 IPPROTO_TCP,
186 m->m_pkthdr.csum_data);
187 th->th_sum ^= 0xffff;
188 } else
189 th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
190 if (th->th_sum) {
191 KMOD_TCPSTAT_INC(tcps_rcvbadsum);
192 m_freem(m);
193 return (-1);
194 }
195 return (etype);
196 }
197 #endif
198 #ifdef INET
199 case ETHERTYPE_IP:
200 {
201 if (m->m_len < sizeof (struct tcpiphdr)) {
202 m = m_pullup(m, sizeof (struct tcpiphdr));
203 if (m == NULL) {
204 KMOD_TCPSTAT_INC(tcps_rcvshort);
205 return (-1);
206 }
207 }
208 ip = (struct ip *)(eh + 1);
209 th = (struct tcphdr *)(ip + 1);
210 drop_hdrlen = sizeof(*ip);
211 iptos = ip->ip_tos;
212 tlen = ntohs(ip->ip_len) - sizeof(struct ip);
213 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
214 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
215 th->th_sum = m->m_pkthdr.csum_data;
216 else
217 th->th_sum = in_pseudo(ip->ip_src.s_addr,
218 ip->ip_dst.s_addr,
219 htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
220 th->th_sum ^= 0xffff;
221 } else {
222 int len;
223 struct ipovly *ipov = (struct ipovly *)ip;
224 /*
225 * Checksum extended TCP header and data.
226 */
227 len = drop_hdrlen + tlen;
228 bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
229 ipov->ih_len = htons(tlen);
230 th->th_sum = in_cksum(m, len);
231 /* Reset length for SDT probes. */
232 ip->ip_len = htons(len);
233 /* Reset TOS bits */
234 ip->ip_tos = iptos;
235 /* Re-initialization for later version check */
236 ip->ip_v = IPVERSION;
237 ip->ip_hl = sizeof(*ip) >> 2;
238 }
239 if (th->th_sum) {
240 KMOD_TCPSTAT_INC(tcps_rcvbadsum);
241 m_freem(m);
242 return (-1);
243 }
244 break;
245 }
246 #endif
247 };
248 return (etype);
249 }
250
251 /*
252 * The function ctf_process_inbound_raw() is used by
253 * transport developers to do the steps needed to
254 * support MBUF Queuing i.e. the flags in
255 * inp->inp_flags2:
256 *
257 * - INP_SUPPORTS_MBUFQ
258 * - INP_MBUF_QUEUE_READY
259 * - INP_DONT_SACK_QUEUE
260 * - INP_MBUF_ACKCMP
261 *
262 * These flags help control how LRO will deliver
263 * packets to the transport. You first set in inp_flags2
264 * the INP_SUPPORTS_MBUFQ to tell the LRO code that you
265 * will gladly take a queue of packets instead of a compressed
266 * single packet. You also set in your t_fb pointer the
267 * tfb_do_queued_segments to point to ctf_process_inbound_raw.
268 *
269 * This then gets you lists of inbound ACK's/Data instead
270 * of a condensed compressed ACK/DATA packet. Why would you
271 * want that? This will get you access to all the arrival
272 * times of at least LRO and possibly at the Hardware (if
273 * the interface card supports that) of the actual ACK/DATA.
274 * In some transport designs this is important since knowing
275 * the actual time we got the packet is useful information.
276 *
277 * A new special type of mbuf may also be supported by the transport
278 * if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will
279 * possibly create a M_ACKCMP type mbuf. This is a mbuf with
280 * an array of "acks". One thing also to note is that when this
281 * occurs a subsequent LRO may find at the back of the untouched
282 * mbuf queue chain a M_ACKCMP and append on to it. This means
283 * that until the transport pulls in the mbuf chain queued
284 * for it more ack's may get on the mbufs that were already
 * delivered. There currently is a limit of 6 acks condensed
 * into 1 mbuf which means often when this is occurring, we
 * don't get that effect but it does happen.
288 *
289 * Now there are some interesting Caveats that the transport
290 * designer needs to take into account when using this feature.
291 *
292 * 1) It is used with HPTS and pacing, when the pacing timer
293 * for output calls it will first call the input.
294 * 2) When you set INP_MBUF_QUEUE_READY this tells LRO
295 * queue normal packets, I am busy pacing out data and
296 * will process the queued packets before my tfb_tcp_output
297 * call from pacing. If a non-normal packet arrives, (e.g. sack)
298 * you will be awoken immediately.
299 * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even
300 * be awoken if a SACK has arrived. You would do this when
301 * you were not only running a pacing for output timer
302 * but a Rack timer as well i.e. you know you are in recovery
303 * and are in the process (via the timers) of dealing with
304 * the loss.
305 *
 * Now a critical thing you must be aware of here is that the
 * use of the flags has a far greater scope than just your
 * typical LRO. Why? Well that's because in the normal compressed
 * LRO case at the end of a driver interrupt all packets are going
310 * to get presented to the transport no matter if there is one
311 * or 100. With the MBUF_QUEUE model, this is not true. You will
312 * only be awoken to process the queue of packets when:
313 * a) The flags discussed above allow it.
314 * <or>
315 * b) You exceed a ack or data limit (by default the
316 * ack limit is infinity (64k acks) and the data
317 * limit is 64k of new TCP data)
318 * <or>
319 * c) The push bit has been set by the peer
320 */
321
static int
ctf_process_inbound_raw(struct tcpcb *tp, struct mbuf *m, int has_pkt)
{
	/*
	 * We are passed a raw change of mbuf packets
	 * that arrived in LRO. They are linked via
	 * the m_nextpkt link in the pkt-headers.
	 *
	 * We process each one by:
	 * a) saving off the next
	 * b) stripping off the ether-header
	 * c) formulating the arguments for tfb_do_segment_nounlock()
	 * d) calling each mbuf to tfb_do_segment_nounlock()
	 *    after adjusting the time to match the arrival time.
	 * Note that the LRO code assures no IP options are present.
	 *
	 * The semantics for calling tfb_do_segment_nounlock() are the
	 * following:
	 * 1) It returns 0 if all went well and you (the caller) need
	 *    to release the lock.
	 * 2) If nxt_pkt is set, then the function will suppress calls
	 *    to tcp_output() since you are promising to call again
	 *    with another packet.
	 * 3) If it returns 1, then you must free all the packets being
	 *    shipped in, the tcb has been destroyed (or about to be destroyed).
	 */
	struct mbuf *m_save;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
#endif
#ifdef INET
	struct ip *ip = NULL;	/* Keep compiler happy. */
#endif
	struct ifnet *ifp;
	struct timeval tv;
	struct inpcb *inp __diagused;
	int32_t retval, nxt_pkt, tlen, off;
	int etype = 0;
	uint16_t drop_hdrlen;
	uint8_t iptos;

	inp = tptoinpcb(tp);
	INP_WLOCK_ASSERT(inp);
	NET_EPOCH_ASSERT();
	KASSERT(m != NULL, ("ctf_process_inbound_raw: m == NULL"));
	ifp = m_rcvif(m);
	KASSERT(ifp != NULL, ("ctf_process_inbound_raw: ifp == NULL"));
	CURVNET_SET(ifp->if_vnet);
	tcp_get_usecs(&tv);
	/* Walk the chain of queued packets, detaching each as we go. */
	while (m) {
		m_save = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if ((m->m_flags & M_ACKCMP) == 0) {
			/* Now lets get the ether header */
			etype = ctf_get_enet_type(ifp, m);
			if (etype == -1) {
				/* Skip this packet it was freed by checksum */
				goto skipped_pkt;
			}
			KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
				("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
			/* Trim off the ethernet header */
			switch (etype) {
#ifdef INET6
			case ETHERTYPE_IPV6:
				ip6 = mtod(m, struct ip6_hdr *);
				th = (struct tcphdr *)(ip6 + 1);
				tlen = ntohs(ip6->ip6_plen);
				drop_hdrlen = sizeof(*ip6);
				/* Traffic class lives in the top of the flow word. */
				iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
				break;
#endif
#ifdef INET
			case ETHERTYPE_IP:
				ip = mtod(m, struct ip *);
				th = (struct tcphdr *)(ip + 1);
				drop_hdrlen = sizeof(*ip);
				iptos = ip->ip_tos;
				tlen = ntohs(ip->ip_len) - sizeof(struct ip);
				break;
#endif
			} /* end switch */
			/* Validate the TCP data offset against the segment. */
			off = th->th_off << 2;
			if (off < sizeof (struct tcphdr) || off > tlen) {
				printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
				    off,
				    sizeof(struct tcphdr),
				    tlen);
				KMOD_TCPSTAT_INC(tcps_rcvbadoff);
				m_freem(m);
				goto skipped_pkt;
			}
			tlen -= off;
			drop_hdrlen += off;
			/*
			 * Now lets setup the timeval to be when we should
			 * have been called (if we can).
			 */
			m->m_pkthdr.lro_nsegs = 1;
			/* Now what about next packet? */
		} else {
			/*
			 * This mbuf is an array of acks that have
			 * been compressed. We assert the inp has
			 * the flag set to enable this!
			 */
			KASSERT((tp->t_flags2 & TF2_MBUF_ACKCMP),
			    ("tp:%p no TF2_MBUF_ACKCMP flags?", tp));
			tlen = 0;
			drop_hdrlen = 0;
			th = NULL;
			iptos = 0;
		}
		tcp_get_usecs(&tv);
		/* Tell the stack whether more packets follow this one. */
		if (m_save || has_pkt)
			nxt_pkt = 1;
		else
			nxt_pkt = 0;
		if ((m->m_flags & M_ACKCMP) == 0)
			KMOD_TCPSTAT_INC(tcps_rcvtotal);
		else
			KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
		retval = (*tp->t_fb->tfb_do_segment_nounlock)(tp, m, th,
		    drop_hdrlen, tlen, iptos, nxt_pkt, &tv);
		if (retval) {
			/* We lost the lock and tcb probably */
			m = m_save;
			while(m) {
				m_save = m->m_nextpkt;
				m->m_nextpkt = NULL;
				m_freem(m);
				m = m_save;
			}
			CURVNET_RESTORE();
			INP_UNLOCK_ASSERT(inp);
			return (retval);
		}
skipped_pkt:
		m = m_save;
	}
	CURVNET_RESTORE();
	return (0);
}
466
467 int
ctf_do_queued_segments(struct tcpcb * tp,int have_pkt)468 ctf_do_queued_segments(struct tcpcb *tp, int have_pkt)
469 {
470 struct mbuf *m;
471
472 /* First lets see if we have old packets */
473 if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) {
474 STAILQ_INIT(&tp->t_inqueue);
475 if (ctf_process_inbound_raw(tp, m, have_pkt)) {
476 /* We lost the tcpcb (maybe a RST came in)? */
477 return(1);
478 }
479 }
480 return (0);
481 }
482
483 uint32_t
ctf_outstanding(struct tcpcb * tp)484 ctf_outstanding(struct tcpcb *tp)
485 {
486 uint32_t bytes_out;
487
488 bytes_out = tp->snd_max - tp->snd_una;
489 if (tp->t_state < TCPS_ESTABLISHED)
490 bytes_out++;
491 if (tp->t_flags & TF_SENTFIN)
492 bytes_out++;
493 return (bytes_out);
494 }
495
496 uint32_t
ctf_flight_size(struct tcpcb * tp,uint32_t rc_sacked)497 ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
498 {
499 if (rc_sacked <= ctf_outstanding(tp))
500 return(ctf_outstanding(tp) - rc_sacked);
501 else {
502 return (0);
503 }
504 }
505
void
ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
    int32_t tlen)
{
	/*
	 * Answer the segment with a RST and, when we were handed a
	 * connection, release its inpcb write lock before returning.
	 * NOTE(review): tcp_dropwithreset() is expected to consume the
	 * mbuf -- callers must not touch *m afterwards.
	 */
	tcp_dropwithreset(m, th, tp, tlen);
	if (tp != NULL)
		INP_WUNLOCK(tptoinpcb(tp));
}
514
515 void
ctf_ack_war_checks(struct tcpcb * tp)516 ctf_ack_war_checks(struct tcpcb *tp)
517 {
518 sbintime_t now;
519
520 if ((V_tcp_ack_war_time_window > 0) && (V_tcp_ack_war_cnt > 0)) {
521 now = getsbinuptime();
522 if (tp->t_challenge_ack_end < now) {
523 tp->t_challenge_ack_cnt = 0;
524 tp->t_challenge_ack_end = now +
525 V_tcp_ack_war_time_window * SBT_1MS;
526 }
527 if (tp->t_challenge_ack_cnt < V_tcp_ack_war_cnt) {
528 tp->t_challenge_ack_cnt++;
529 tp->t_flags |= TF_ACKNOW;
530 } else
531 tp->t_flags &= ~TF_ACKNOW;
532 } else
533 tp->t_flags |= TF_ACKNOW;
534 }
535
536 /*
537 * ctf_drop_checks returns 1 for you should not proceed. It places
538 * in ret_val what should be returned 1/0 by the caller. The 1 indicates
539 * that the TCB is unlocked and probably dropped. The 0 indicates the
540 * TCB is still valid and locked.
541 */
int
ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t *tlenp,
    int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val)
{
	int32_t todrop;
	int32_t thflags;
	int32_t tlen;

	thflags = *thf;
	tlen = *tlenp;
	/* How much of the segment's head precedes rcv_nxt (duplicate)? */
	todrop = tp->rcv_nxt - th->th_seq;
	if (todrop > 0) {
		if (thflags & TH_SYN) {
			/*
			 * The SYN occupies the first sequence number:
			 * consume it and slide the segment forward one.
			 */
			thflags &= ~TH_SYN;
			th->th_seq++;
			if (th->th_urp > 1)
				th->th_urp--;
			else
				thflags &= ~TH_URG;
			todrop--;
		}
		/*
		 * Following if statement from Stevens, vol. 2, p. 960.
		 */
		if (todrop > tlen
		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
			/*
			 * Any valid FIN must be to the left of the window.
			 * At this point the FIN must be a duplicate or out
			 * of sequence; drop it.
			 */
			thflags &= ~TH_FIN;
			/*
			 * Send an ACK to resynchronize and drop any data.
			 * But keep on processing for RST or ACK.
			 */
			ctf_ack_war_checks(tp);
			todrop = tlen;
			KMOD_TCPSTAT_INC(tcps_rcvduppack);
			KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
		} else {
			/* Only part of the segment is a duplicate. */
			KMOD_TCPSTAT_INC(tcps_rcvpartduppack);
			KMOD_TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
		}
		/*
		 * DSACK - add SACK block for dropped range
		 */
		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
			/*
			 * ACK now, as the next in-sequence segment
			 * will clear the DSACK block again
			 */
			ctf_ack_war_checks(tp);
			if (tp->t_flags & TF_ACKNOW)
				tcp_update_sack_list(tp, th->th_seq,
				    th->th_seq + todrop);
		}
		*drop_hdrlen += todrop;	/* drop from the top afterwards */
		th->th_seq += todrop;
		tlen -= todrop;
		/* Adjust the urgent pointer past the dropped bytes. */
		if (th->th_urp > todrop)
			th->th_urp -= todrop;
		else {
			thflags &= ~TH_URG;
			th->th_urp = 0;
		}
	}
	/*
	 * If segment ends after window, drop trailing data (and PUSH and
	 * FIN); if nothing left, just ACK.
	 */
	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
	if (todrop > 0) {
		KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
		if (todrop >= tlen) {
			KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
			/*
			 * If window is closed can only take segments at
			 * window edge, and have to drop data and PUSH from
			 * incoming segments. Continue processing, but
			 * remember to ack. Otherwise, drop segment and
			 * ack.
			 */
			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
				/* Zero-window probe: ACK but keep going. */
				ctf_ack_war_checks(tp);
				KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
			} else {
				ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
				return (1);
			}
		} else
			KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
		/* Trim the overflow off of the tail of the mbuf. */
		m_adj(m, -todrop);
		tlen -= todrop;
		thflags &= ~(TH_PUSH | TH_FIN);
	}
	*thf = thflags;
	*tlenp = tlen;
	return (0);
}
643
644 /*
645 * The value in ret_val informs the caller
646 * if we dropped the tcb (and lock) or not.
647 * 1 = we dropped it, 0 = the TCB is still locked
648 * and valid.
649 */
650 void
ctf_do_dropafterack(struct mbuf * m,struct tcpcb * tp,struct tcphdr * th,int32_t thflags,int32_t tlen,int32_t * ret_val)651 ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val)
652 {
653 /*
654 * Generate an ACK dropping incoming segment if it occupies sequence
655 * space, where the ACK reflects our state.
656 *
657 * We can now skip the test for the RST flag since all paths to this
658 * code happen after packets containing RST have been dropped.
659 *
660 * In the SYN-RECEIVED state, don't send an ACK unless the segment
661 * we received passes the SYN-RECEIVED ACK test. If it fails send a
662 * RST. This breaks the loop in the "LAND" DoS attack, and also
663 * prevents an ACK storm between two listening ports that have been
664 * sent forged SYN segments, each with the source address of the
665 * other.
666 */
667 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
668 (SEQ_GT(tp->snd_una, th->th_ack) ||
669 SEQ_GT(th->th_ack, tp->snd_max))) {
670 *ret_val = 1;
671 ctf_do_dropwithreset(m, tp, th, tlen);
672 return;
673 } else
674 *ret_val = 0;
675 ctf_ack_war_checks(tp);
676 if (m)
677 m_freem(m);
678 }
679
680 void
ctf_do_drop(struct mbuf * m,struct tcpcb * tp)681 ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
682 {
683
684 /*
685 * Drop space held by incoming segment and return.
686 */
687 if (tp != NULL)
688 INP_WUNLOCK(tptoinpcb(tp));
689 if (m)
690 m_freem(m);
691 }
692
int
ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp)
{
	/*
	 * RFC5961 Section 3.2
	 *
	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
	 * window, we send challenge ACK.
	 *
	 * Note: to take into account delayed ACKs, we should test against
	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
	 * of closed window, not covered by the RFC.
	 *
	 * Returns 1 when the connection was dropped (the tcpcb is gone
	 * and the lock was released via ctf_do_drop()), otherwise 0 and
	 * the tcpcb is still valid and locked.
	 */
	int dropped = 0;

	/* Is the RST's sequence number inside the receive window? */
	if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
		KASSERT(tp->t_state != TCPS_SYN_SENT,
		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
		    __func__, th, tp));

		/*
		 * Only an exact sequence match (or the insecure-RST
		 * sysctl) tears the connection down; an in-window but
		 * inexact match gets a challenge ACK instead.
		 */
		if (V_tcp_insecure_rst ||
		    (tp->last_ack_sent == th->th_seq) ||
		    (tp->rcv_nxt == th->th_seq)) {
			KMOD_TCPSTAT_INC(tcps_drops);
			/* Drop the connection. */
			switch (tp->t_state) {
			case TCPS_SYN_RECEIVED:
				so->so_error = ECONNREFUSED;
				goto close;
			case TCPS_ESTABLISHED:
			case TCPS_FIN_WAIT_1:
			case TCPS_FIN_WAIT_2:
			case TCPS_CLOSE_WAIT:
			case TCPS_CLOSING:
			case TCPS_LAST_ACK:
				so->so_error = ECONNRESET;
		close:
				/* FALLTHROUGH */
			default:
				tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
				tp = tcp_close(tp);
			}
			dropped = 1;
			ctf_do_drop(m, tp);
		} else {
			KMOD_TCPSTAT_INC(tcps_badrst);
			/* In-window but not an exact match: challenge ACK. */
			tcp_send_challenge_ack(tp, th, m);
		}
	} else {
		/* Out-of-window RST: silently discard the segment. */
		m_freem(m);
	}
	return (dropped);
}
749
750 /*
751 * The value in ret_val informs the caller
752 * if we dropped the tcb (and lock) or not.
753 * 1 = we dropped it, 0 = the TCB is still locked
754 * and valid.
755 */
756 void
ctf_challenge_ack(struct mbuf * m,struct tcphdr * th,struct tcpcb * tp,uint8_t iptos,int32_t * ret_val)757 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, uint8_t iptos, int32_t * ret_val)
758 {
759
760 NET_EPOCH_ASSERT();
761
762 KMOD_TCPSTAT_INC(tcps_badsyn);
763 if (V_tcp_insecure_syn &&
764 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
765 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
766 tp = tcp_drop(tp, ECONNRESET);
767 *ret_val = 1;
768 ctf_do_drop(m, tp);
769 } else {
770 tcp_ecn_input_syn_sent(tp, tcp_get_flags(th), iptos);
771 /* Send challenge ACK. */
772 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
773 tp->snd_nxt, TH_ACK);
774 tp->last_ack_sent = tp->rcv_nxt;
775 m = NULL;
776 *ret_val = 0;
777 ctf_do_drop(m, NULL);
778 }
779 }
780
781 /*
782 * ctf_ts_check returns 1 for you should not proceed, the state
783 * machine should return. It places in ret_val what should
784 * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
785 * that the TCB is unlocked and probably dropped. The 0 indicates the
786 * TCB is still valid and locked.
787 */
788 int
ctf_ts_check(struct mbuf * m,struct tcphdr * th,struct tcpcb * tp,int32_t tlen,int32_t thflags,int32_t * ret_val)789 ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
790 int32_t tlen, int32_t thflags, int32_t * ret_val)
791 {
792
793 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
794 /*
795 * Invalidate ts_recent. If this segment updates ts_recent,
796 * the age will be reset later and ts_recent will get a
797 * valid value. If it does not, setting ts_recent to zero
798 * will at least satisfy the requirement that zero be placed
799 * in the timestamp echo reply when ts_recent isn't valid.
800 * The age isn't reset until we get a valid ts_recent
801 * because we don't want out-of-order segments to be dropped
802 * when ts_recent is old.
803 */
804 tp->ts_recent = 0;
805 } else {
806 KMOD_TCPSTAT_INC(tcps_rcvduppack);
807 KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
808 KMOD_TCPSTAT_INC(tcps_pawsdrop);
809 *ret_val = 0;
810 if (tlen) {
811 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
812 } else {
813 ctf_do_drop(m, NULL);
814 }
815 return (1);
816 }
817 return (0);
818 }
819
820 int
ctf_ts_check_ac(struct tcpcb * tp,int32_t thflags)821 ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
822 {
823
824 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
825 /*
826 * Invalidate ts_recent. If this segment updates ts_recent,
827 * the age will be reset later and ts_recent will get a
828 * valid value. If it does not, setting ts_recent to zero
829 * will at least satisfy the requirement that zero be placed
830 * in the timestamp echo reply when ts_recent isn't valid.
831 * The age isn't reset until we get a valid ts_recent
832 * because we don't want out-of-order segments to be dropped
833 * when ts_recent is old.
834 */
835 tp->ts_recent = 0;
836 } else {
837 KMOD_TCPSTAT_INC(tcps_rcvduppack);
838 KMOD_TCPSTAT_INC(tcps_pawsdrop);
839 return (1);
840 }
841 return (0);
842 }
843
844
845
846 void
ctf_calc_rwin(struct socket * so,struct tcpcb * tp)847 ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
848 {
849 int32_t win;
850
851 /*
852 * Calculate amount of space in receive window, and then do TCP
853 * input processing. Receive window is amount of space in rcv queue,
854 * but not less than advertised window.
855 */
856 win = sbspace(&so->so_rcv);
857 if (win < 0)
858 win = 0;
859 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
860 }
861
862 void
ctf_do_dropwithreset_conn(struct mbuf * m,struct tcpcb * tp,struct tcphdr * th,int32_t tlen)863 ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
864 int32_t tlen)
865 {
866
867 tcp_dropwithreset(m, th, tp, tlen);
868 tp = tcp_drop(tp, ETIMEDOUT);
869 if (tp)
870 INP_WUNLOCK(tptoinpcb(tp));
871 }
872
uint32_t
ctf_fixed_maxseg(struct tcpcb *tp)
{
	/* Thin wrapper so the stacks share one MSS-sizing entry point. */
	return (tcp_fixed_maxseg(tp));
}
878
879 void
ctf_log_sack_filter(struct tcpcb * tp,int num_sack_blks,struct sackblk * sack_blocks)880 ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
881 {
882 if (tcp_bblogging_on(tp)) {
883 union tcp_log_stackspecific log;
884 struct timeval tv;
885
886 memset(&log, 0, sizeof(log));
887 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
888 log.u_bbr.flex8 = num_sack_blks;
889 if (num_sack_blks > 0) {
890 log.u_bbr.flex1 = sack_blocks[0].start;
891 log.u_bbr.flex2 = sack_blocks[0].end;
892 }
893 if (num_sack_blks > 1) {
894 log.u_bbr.flex3 = sack_blocks[1].start;
895 log.u_bbr.flex4 = sack_blocks[1].end;
896 }
897 if (num_sack_blks > 2) {
898 log.u_bbr.flex5 = sack_blocks[2].start;
899 log.u_bbr.flex6 = sack_blocks[2].end;
900 }
901 if (num_sack_blks > 3) {
902 log.u_bbr.applimited = sack_blocks[3].start;
903 log.u_bbr.pkts_out = sack_blocks[3].end;
904 }
905 TCP_LOG_EVENTP(tp, NULL,
906 &tptosocket(tp)->so_rcv,
907 &tptosocket(tp)->so_snd,
908 TCP_SACK_FILTER_RES, 0,
909 0, &log, false, &tv);
910 }
911 }
912
uint32_t
ctf_decay_count(uint32_t count, uint32_t decay)
{
	/*
	 * Given a count, decay it by a set percentage. The
	 * percentage is expressed in thousandths, i.e.
	 * 100% = 1000 and 19.3% = 193.
	 */
	uint64_t reduction;

	if (decay > 1000) {
		/* More than 100%: we never raise the count. */
		return (count);
	}
	/* Widen to 64 bits so the multiply cannot overflow. */
	reduction = ((uint64_t)count * (uint64_t)decay) / 1000;
	return (count - (uint32_t)reduction);
}
938
939 int32_t
ctf_progress_timeout_check(struct tcpcb * tp,bool log)940 ctf_progress_timeout_check(struct tcpcb *tp, bool log)
941 {
942 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
943 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
944 /*
945 * There is an assumption that the caller
946 * will drop the connection so we will
947 * increment the counters here.
948 */
949 if (log)
950 tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS);
951 #ifdef NETFLIX_STATS
952 KMOD_TCPSTAT_INC(tcps_progdrops);
953 #endif
954 return (1);
955 }
956 }
957 return (0);
958 }
959