1 /*-
2 * Copyright (c) 2016-2020 Netflix, Inc.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26
27 #include <sys/cdefs.h>
28 #include "opt_inet.h"
29 #include "opt_inet6.h"
30 #include "opt_ipsec.h"
31 #include "opt_ratelimit.h"
32 #include "opt_kern_tls.h"
33 #if defined(INET) || defined(INET6)
34 #include <sys/param.h>
35 #include <sys/arb.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #ifdef TCP_HHOOK
39 #include <sys/hhook.h>
40 #endif
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/mutex.h>
44 #include <sys/mbuf.h>
45 #include <sys/proc.h> /* for proc0 declaration */
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <sys/sysctl.h>
49 #include <sys/systm.h>
50 #ifdef STATS
51 #include <sys/qmath.h>
52 #include <sys/tree.h>
53 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
54 #else
55 #include <sys/tree.h>
56 #endif
57 #include <sys/refcount.h>
58 #include <sys/queue.h>
59 #include <sys/tim_filter.h>
60 #include <sys/smp.h>
61 #include <sys/kthread.h>
62 #include <sys/kern_prefetch.h>
63 #include <sys/protosw.h>
64 #ifdef TCP_ACCOUNTING
65 #include <sys/sched.h>
66 #include <machine/cpu.h>
67 #endif
68 #include <vm/uma.h>
69
70 #include <net/route.h>
71 #include <net/route/nhop.h>
72 #include <net/vnet.h>
73
74 #define TCPSTATES /* for logging */
75
76 #include <netinet/in.h>
77 #include <netinet/in_kdtrace.h>
78 #include <netinet/in_pcb.h>
79 #include <netinet/ip.h>
80 #include <netinet/ip_var.h>
81 #include <netinet/ip6.h>
82 #include <netinet6/in6_pcb.h>
83 #include <netinet6/ip6_var.h>
84 #include <netinet/tcp.h>
85 #define TCPOUTFLAGS
86 #include <netinet/tcp_fsm.h>
87 #include <netinet/tcp_seq.h>
88 #include <netinet/tcp_timer.h>
89 #include <netinet/tcp_var.h>
90 #include <netinet/tcp_log_buf.h>
91 #include <netinet/tcp_syncache.h>
92 #include <netinet/tcp_hpts.h>
93 #include <netinet/tcp_ratelimit.h>
94 #include <netinet/tcp_accounting.h>
95 #include <netinet/tcpip.h>
96 #include <netinet/cc/cc.h>
97 #include <netinet/cc/cc_newreno.h>
98 #include <netinet/tcp_fastopen.h>
99 #include <netinet/tcp_lro.h>
100 #ifdef NETFLIX_SHARED_CWND
101 #include <netinet/tcp_shared_cwnd.h>
102 #endif
103 #ifdef TCP_OFFLOAD
104 #include <netinet/tcp_offload.h>
105 #endif
106 #ifdef INET6
107 #include <netinet6/tcp6_var.h>
108 #endif
109 #include <netinet/tcp_ecn.h>
110
111 #include <netipsec/ipsec_support.h>
112
113 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
114 #include <netipsec/ipsec.h>
115 #include <netipsec/ipsec6.h>
116 #endif /* IPSEC */
117
118 #include <netinet/udp.h>
119 #include <netinet/udp_var.h>
120 #include <machine/in_cksum.h>
121
122 #ifdef MAC
123 #include <security/mac/mac_framework.h>
124 #endif
125 #include "sack_filter.h"
126 #include "tcp_rack.h"
127 #include "tailq_hash.h"
128 #include "rack_bbr_common.h"
129
130 uma_zone_t rack_zone;
131 uma_zone_t rack_pcb_zone;
132
133 #ifndef TICKS2SBT
134 #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
135 #endif
136
137 VNET_DECLARE(uint32_t, newreno_beta);
138 VNET_DECLARE(uint32_t, newreno_beta_ecn);
139 #define V_newreno_beta VNET(newreno_beta)
140 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
141
142 #define M_TCPFSB __CONCAT(M_TCPFSB, STACKNAME)
143 #define M_TCPDO __CONCAT(M_TCPDO, STACKNAME)
144
145 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
146 MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
147 MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");
148
149 struct sysctl_ctx_list rack_sysctl_ctx;
150 struct sysctl_oid *rack_sysctl_root;
151
152 #define CUM_ACKED 1
153 #define SACKED 2
154
155 /*
156 * The RACK module incorporates a number of
157 * TCP ideas that have been put out into the IETF
158 * over the last few years:
159 * - Matt Mathis's Rate Halving which slowly drops
160 * the congestion window so that the ack clock can
161 * be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
163 * will stop us using the number of dup acks and instead
164 * use time as the gage of when we retransmit.
165 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
166 * of Dukkipati et.al.
167 * RACK depends on SACK, so if an endpoint arrives that
168 * cannot do SACK the state machine below will shuttle the
169 * connection back to using the "default" TCP stack that is
170 * in FreeBSD.
171 *
172 * To implement RACK the original TCP stack was first decomposed
173 * into a functional state machine with individual states
174 * for each of the possible TCP connection states. The do_segment
175 * functions role in life is to mandate the connection supports SACK
 * initially and then assure that the RACK state matches the connection
177 * state before calling the states do_segment function. Each
178 * state is simplified due to the fact that the original do_segment
179 * has been decomposed and we *know* what state we are in (no
180 * switches on the state) and all tests for SACK are gone. This
181 * greatly simplifies what each state does.
182 *
183 * TCP output is also over-written with a new version since it
184 * must maintain the new rack scoreboard.
185 *
186 */
187 static int32_t rack_tlp_thresh = 1;
188 static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */
189 static int32_t rack_tlp_use_greater = 1;
190 static int32_t rack_reorder_thresh = 2;
191 static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000
192 * - 60 seconds */
193 static uint32_t rack_pcm_every_n_rounds = 100;
194 static uint32_t rack_pcm_blast = 0;
195 static uint32_t rack_pcm_is_enabled = 1;
196 static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */
197
198 static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */
199 static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */
200
201
202 static int32_t rack_rxt_scoreboard_clear_thresh = 2;
203 static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */
204 static int32_t rack_rxt_controls = 0;
205 static int32_t rack_fill_cw_state = 0;
206 static uint8_t rack_req_measurements = 1;
207 static uint32_t rack_rtt_divisor = 2;
208 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
209 static int32_t rack_hw_rate_caps = 0; /* 1; */
210 static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */
211 static int32_t rack_hw_rate_min = 0; /* 1500000;*/
212 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
213 static int32_t rack_hw_up_only = 0;
214 static int32_t rack_stats_gets_ms_rtt = 1;
215 static int32_t rack_prr_addbackmax = 2;
216 static int32_t rack_do_hystart = 0;
217 static int32_t rack_apply_rtt_with_reduced_conf = 0;
218 static int32_t rack_hibeta_setting = 0;
219 static int32_t rack_default_pacing_divisor = 250;
220 static uint16_t rack_pacing_min_seg = 0;
221 static int32_t rack_timely_off = 0;
222
223 static int32_t rack_pkt_delay = 1000;
224 static int32_t rack_send_a_lot_in_prr = 1;
225 static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */
226 static int32_t rack_verbose_logging = 0;
227 static int32_t rack_ignore_data_after_close = 1;
228 static int32_t rack_enable_shared_cwnd = 1;
229 static int32_t rack_use_cmp_acks = 1;
230 static int32_t rack_use_fsb = 1;
231 static int32_t rack_use_rfo = 1;
232 static int32_t rack_use_rsm_rfo = 1;
233 static int32_t rack_max_abc_post_recovery = 2;
234 static int32_t rack_client_low_buf = 0;
235 static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
236 static int32_t rack_bw_multipler = 0; /* Limit on fill cw's jump up to be this x gp_est */
237 #ifdef TCP_ACCOUNTING
238 static int32_t rack_tcp_accounting = 0;
239 #endif
240 static int32_t rack_limits_scwnd = 1;
241 static int32_t rack_enable_mqueue_for_nonpaced = 0;
242 static int32_t rack_hybrid_allow_set_maxseg = 0;
243 static int32_t rack_disable_prr = 0;
244 static int32_t use_rack_rr = 1;
245 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
246 static int32_t rack_persist_min = 250000; /* 250usec */
247 static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */
248 static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */
249 static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */
250 static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
251 static int32_t rack_limit_time_with_srtt = 0;
252 static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
253 static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */
254 static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
255 static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
256 static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */
257 static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */
258
259 /*
260 * Currently regular tcp has a rto_min of 30ms
261 * the backoff goes 12 times so that ends up
262 * being a total of 122.850 seconds before a
263 * connection is killed.
264 */
265 static uint32_t rack_def_data_window = 20;
266 static uint32_t rack_goal_bdp = 2;
267 static uint32_t rack_min_srtts = 1;
268 static uint32_t rack_min_measure_usec = 0;
269 static int32_t rack_tlp_min = 10000; /* 10ms */
270 static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */
271 static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */
272 static const int32_t rack_free_cache = 2;
273 static int32_t rack_hptsi_segments = 40;
274 static int32_t rack_rate_sample_method = USE_RTT_LOW;
275 static int32_t rack_pace_every_seg = 0;
276 static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
277 static int32_t rack_pacing_delay_reduction = 4;
278 static int32_t rack_wma_divisor = 8; /* For WMA calculation */
279 static int32_t rack_cwnd_block_ends_measure = 0;
280 static int32_t rack_rwnd_block_ends_measure = 0;
281 static int32_t rack_def_profile = 0;
282
283 static int32_t rack_lower_cwnd_at_tlp = 0;
284 static int32_t rack_always_send_oldest = 0;
285 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
286
287 static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */
288 static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */
289 static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */
290
/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2; /* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000; /* 10 sec in usecs */
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decide a hbp */
315
316 /* Part of pacing */
317 static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */
318
319 /* Timely information:
320 *
321 * Here we have various control parameters on how
322 * timely may change the multiplier. rack_gain_p5_ub
323 * is associated with timely but not directly influencing
324 * the rate decision like the other variables. It controls
325 * the way fill-cw interacts with timely and caps how much
326 * timely can boost the fill-cw b/w.
327 *
328 * The other values are various boost/shrink numbers as well
329 * as potential caps when adjustments are made to the timely
330 * gain (returned by rack_get_output_gain(). Remember too that
331 * the gain returned can be overriden by other factors such as
332 * probeRTT as well as fixed-rate-pacing.
333 */
334 static int32_t rack_gain_p5_ub = 250;
335 static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */
336 static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */
337 static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */
338 static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */
339 static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
340 static int32_t rack_gp_decrease_per = 80; /* Beta value of timely decrease (.8) = 80 */
341 static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */
342 static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */
343 static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */
344 static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */
345 static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */
346 static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing it's multiplier */
347 static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */
348 static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */
349 static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */
350 static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */
351 static int32_t rack_timely_no_stopping = 0;
352 static int32_t rack_down_raise_thresh = 100;
353 static int32_t rack_req_segs = 1;
354 static uint64_t rack_bw_rate_cap = 0;
355 static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */
356
357
358 /* Rack specific counters */
359 counter_u64_t rack_saw_enobuf;
360 counter_u64_t rack_saw_enobuf_hw;
361 counter_u64_t rack_saw_enetunreach;
362 counter_u64_t rack_persists_sends;
363 counter_u64_t rack_persists_acks;
364 counter_u64_t rack_persists_loss;
365 counter_u64_t rack_persists_lost_ends;
366 counter_u64_t rack_total_bytes;
367 #ifdef INVARIANTS
368 counter_u64_t rack_adjust_map_bw;
369 #endif
370 /* Tail loss probe counters */
371 counter_u64_t rack_tlp_tot;
372 counter_u64_t rack_tlp_newdata;
373 counter_u64_t rack_tlp_retran;
374 counter_u64_t rack_tlp_retran_bytes;
375 counter_u64_t rack_to_tot;
376 counter_u64_t rack_hot_alloc;
377 counter_u64_t rack_to_alloc;
378 counter_u64_t rack_to_alloc_hard;
379 counter_u64_t rack_to_alloc_emerg;
380 counter_u64_t rack_to_alloc_limited;
381 counter_u64_t rack_alloc_limited_conns;
382 counter_u64_t rack_split_limited;
383 counter_u64_t rack_rxt_clamps_cwnd;
384 counter_u64_t rack_rxt_clamps_cwnd_uniq;
385
386 counter_u64_t rack_multi_single_eq;
387 counter_u64_t rack_proc_non_comp_ack;
388
389 counter_u64_t rack_fto_send;
390 counter_u64_t rack_fto_rsm_send;
391 counter_u64_t rack_nfto_resend;
392 counter_u64_t rack_non_fto_send;
393 counter_u64_t rack_extended_rfo;
394
395 counter_u64_t rack_sack_proc_all;
396 counter_u64_t rack_sack_proc_short;
397 counter_u64_t rack_sack_proc_restart;
398
399 counter_u64_t rack_input_idle_reduces;
400 counter_u64_t rack_collapsed_win;
401 counter_u64_t rack_collapsed_win_seen;
402 counter_u64_t rack_collapsed_win_rxt;
403 counter_u64_t rack_collapsed_win_rxt_bytes;
404 counter_u64_t rack_try_scwnd;
405 counter_u64_t rack_hw_pace_init_fail;
406 counter_u64_t rack_hw_pace_lost;
407
408 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
409 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
410
411
412 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))
413
414 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \
415 (tv) = (value) + slop; \
416 if ((u_long)(tv) < (u_long)(tvmin)) \
417 (tv) = (tvmin); \
418 if ((u_long)(tv) > (u_long)(tvmax)) \
419 (tv) = (tvmax); \
420 } while (0)
421
422 static void
423 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
424
425 static int
426 rack_process_ack(struct mbuf *m, struct tcphdr *th,
427 struct socket *so, struct tcpcb *tp, struct tcpopt *to,
428 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen);
429 static int
430 rack_process_data(struct mbuf *m, struct tcphdr *th,
431 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
432 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
433 static void
434 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
435 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
436 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
437 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
438 uint8_t limit_type);
439 static struct rack_sendmap *
440 rack_check_recovery_mode(struct tcpcb *tp,
441 uint32_t tsused);
442 static uint32_t
443 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack);
444 static void
445 rack_cong_signal(struct tcpcb *tp,
446 uint32_t type, uint32_t ack, int );
447 static void rack_counter_destroy(void);
448 static int
449 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
450 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
451 static void
452 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
453 static void
454 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
455 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos);
456 static void rack_dtor(void *mem, int32_t size, void *arg);
457 static void
458 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
459 uint32_t flex1, uint32_t flex2,
460 uint32_t flex3, uint32_t flex4,
461 uint32_t flex5, uint32_t flex6,
462 uint16_t flex7, uint8_t mod);
463
464 static void
465 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
466 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
467 struct rack_sendmap *rsm, uint8_t quality);
468 static struct rack_sendmap *
469 rack_find_high_nonack(struct tcp_rack *rack,
470 struct rack_sendmap *rsm);
471 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
472 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
473 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
474 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt);
475 static void
476 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
477 tcp_seq th_ack, int line, uint8_t quality);
478 static void
479 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm);
480
481 static uint32_t
482 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
483 static int32_t rack_handoff_ok(struct tcpcb *tp);
484 static int32_t rack_init(struct tcpcb *tp, void **ptr);
485 static void rack_init_sysctls(void);
486
487 static void
488 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
489 struct tcphdr *th, int entered_rec, int dup_ack_struck,
490 int *dsack_seen, int *sacks_seen);
491 static void
492 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
493 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
494 struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);
495
496 static uint64_t rack_get_gp_est(struct tcp_rack *rack);
497
498
499 static void
500 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
501 struct rack_sendmap *rsm, uint32_t cts, int line);
502 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
503 static int32_t rack_output(struct tcpcb *tp);
504
505 static uint32_t
506 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
507 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
508 uint32_t cts, uint32_t segsiz);
509 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
510 static void rack_remxt_tmr(struct tcpcb *tp);
511 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt);
512 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
513 static int32_t rack_stopall(struct tcpcb *tp);
514 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
515 static uint32_t
516 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
517 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz);
518 static void
519 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
520 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz);
521 static int
522 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
523 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
524 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
525 static int
526 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
527 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
528 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
529
530 static int
531 rack_do_closing(struct mbuf *m, struct tcphdr *th,
532 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
533 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
534 static int
535 rack_do_established(struct mbuf *m, struct tcphdr *th,
536 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
537 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
538 static int
539 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
540 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
541 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
542 static int
543 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
544 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
545 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
546 static int
547 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
548 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
549 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
550 static int
551 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
552 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
553 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
554 static int
555 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
556 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
557 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
558 static int
559 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
560 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
561 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
562 static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts);
563 struct rack_sendmap *
564 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
565 uint32_t tsused);
566 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
567 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
568 static void
569 tcp_rack_partialack(struct tcpcb *tp);
570 static int
571 rack_set_profile(struct tcp_rack *rack, int prof);
572 static void
573 rack_apply_deferred_options(struct tcp_rack *rack);
574
575 int32_t rack_clear_counter=0;
576
577 static uint64_t
rack_get_lt_bw(struct tcp_rack * rack)578 rack_get_lt_bw(struct tcp_rack *rack)
579 {
580 struct timeval tv;
581 uint64_t tim, bytes;
582
583 tim = rack->r_ctl.lt_bw_time;
584 bytes = rack->r_ctl.lt_bw_bytes;
585 if (rack->lt_bw_up) {
586 /* Include all the current bytes too */
587 microuptime(&tv);
588 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
589 tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark);
590 }
591 if ((bytes != 0) && (tim != 0))
592 return ((bytes * (uint64_t)1000000) / tim);
593 else
594 return (0);
595 }
596
static void
rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
{
	/*
	 * Exchange the connection's live new-reno beta/beta_ecn values with
	 * the pair saved in rack->r_ctl.rc_saved_beta{,_ecn}, using the CC
	 * module's ctl_output interface.  Because it is a swap, the same
	 * routine serves both to install the pacing values and to restore
	 * the originals; flex8 (3 = set, 4 = restore, per the callers) is
	 * only recorded in the BB-log entry emitted at "out:".  On any
	 * failure the swap is abandoned part-way and the step that failed
	 * (1-6) is logged in flex6.
	 */
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct tcpcb *tp;
	uint32_t old_beta;
	uint32_t old_beta_ecn;
	int error = 0, failed = 0;

	tp = rack->rc_tp;
	if (tp->t_cc == NULL) {
		/* Tcb is leaving */
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno we can't play games with beta! */
		failed = 1;
		goto out;

	}
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, not using new-reno so no swaps.? */
		failed = 2;
		goto out;
	}
	/* Get the current values out */
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_GET;
	opt.name = CC_NEWRENO_BETA;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 3;
		goto out;
	}
	old_beta = opt.val;
	opt.name = CC_NEWRENO_BETA_ECN;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 4;
		goto out;
	}
	old_beta_ecn = opt.val;

	/* Now lets set in the values we have stored */
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 5;
		goto out;
	}
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta_ecn;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 6;
		goto out;
	}
	/* Save off the values for restoral */
	rack->r_ctl.rc_saved_beta = old_beta;
	rack->r_ctl.rc_saved_beta_ecn = old_beta_ecn;
out:
	/* Log the outcome (success or the failed step) if verbose BB logging is on. */
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		struct newreno *ptr;

		/*
		 * NOTE(review): cc_data is cast to struct newreno and
		 * dereferenced unconditionally below, but on the failed==1/2
		 * paths the algo was NOT verified to be new-reno -- confirm
		 * cc_data is safe to read as a newreno here.
		 */
		ptr = ((struct newreno *)tp->t_ccv.cc_data);
		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta_ecn;
		log.u_bbr.flex6 = failed;
		/* flex7 packs three flags: gp_ready | use_fixed_rate | rc_pacing_cc_set */
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = flex8;
		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}
687
688 static void
rack_set_cc_pacing(struct tcp_rack * rack)689 rack_set_cc_pacing(struct tcp_rack *rack)
690 {
691 if (rack->rc_pacing_cc_set)
692 return;
693 /*
694 * Use the swap utility placing in 3 for flex8 to id a
695 * set of a new set of values.
696 */
697 rack->rc_pacing_cc_set = 1;
698 rack_swap_beta_values(rack, 3);
699 }
700
701 static void
rack_undo_cc_pacing(struct tcp_rack * rack)702 rack_undo_cc_pacing(struct tcp_rack *rack)
703 {
704 if (rack->rc_pacing_cc_set == 0)
705 return;
706 /*
707 * Use the swap utility placing in 4 for flex8 to id a
708 * restoral of the old values.
709 */
710 rack->rc_pacing_cc_set = 0;
711 rack_swap_beta_values(rack, 4);
712 }
713
714 static void
rack_remove_pacing(struct tcp_rack * rack)715 rack_remove_pacing(struct tcp_rack *rack)
716 {
717 if (rack->rc_pacing_cc_set)
718 rack_undo_cc_pacing(rack);
719 if (rack->r_ctl.pacing_method & RACK_REG_PACING)
720 tcp_decrement_paced_conn();
721 if (rack->r_ctl.pacing_method & RACK_DGP_PACING)
722 tcp_dec_dgp_pacing_cnt();
723 rack->rc_always_pace = 0;
724 rack->r_ctl.pacing_method = RACK_PACING_NONE;
725 rack->dgp_on = 0;
726 rack->rc_hybrid_mode = 0;
727 rack->use_fixed_rate = 0;
728 }
729
static void
rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
    uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
{
	/*
	 * BB-log the bounds of a goodput measurement (sequence end, ack
	 * time, send time) together with the tcpcb's current gput_* window.
	 * Only emitted when both BB logging and verbose logging are enabled.
	 * "line" records the caller's source line; "mode" identifies which
	 * code path logged.
	 */
	if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = seq_end;
		log.u_bbr.flex2 = rack->rc_tp->gput_seq;
		log.u_bbr.flex3 = ack_end_t;
		log.u_bbr.flex4 = rack->rc_tp->gput_ts;
		log.u_bbr.flex5 = send_end_t;
		log.u_bbr.flex6 = rack->rc_tp->gput_ack;
		log.u_bbr.flex7 = mode;
		log.u_bbr.flex8 = 69;	/* constant tag identifying this log point */
		log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts;
		log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts;
		log.u_bbr.pkts_out = line;
		log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
		log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		if (rsm != NULL) {
			/* Include the send-map entry's bounds and flags. */
			log.u_bbr.applimited = rsm->r_start;
			log.u_bbr.delivered = rsm->r_end;
			/*
			 * NOTE(review): this overwrites the current_round
			 * value stored in epoch just above -- confirm that
			 * is intended.
			 */
			log.u_bbr.epoch = rsm->r_flags;
		}
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_HPTSI_CALC, 0,
		    0, &log, false, &tv);
	}
}
767
768 static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)769 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
770 {
771 uint32_t stat;
772 int32_t error;
773
774 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
775 if (error || req->newptr == NULL)
776 return error;
777
778 error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
779 if (error)
780 return (error);
781 if (stat == 1) {
782 #ifdef INVARIANTS
783 printf("Clearing RACK counters\n");
784 #endif
785 counter_u64_zero(rack_tlp_tot);
786 counter_u64_zero(rack_tlp_newdata);
787 counter_u64_zero(rack_tlp_retran);
788 counter_u64_zero(rack_tlp_retran_bytes);
789 counter_u64_zero(rack_to_tot);
790 counter_u64_zero(rack_saw_enobuf);
791 counter_u64_zero(rack_saw_enobuf_hw);
792 counter_u64_zero(rack_saw_enetunreach);
793 counter_u64_zero(rack_persists_sends);
794 counter_u64_zero(rack_total_bytes);
795 counter_u64_zero(rack_persists_acks);
796 counter_u64_zero(rack_persists_loss);
797 counter_u64_zero(rack_persists_lost_ends);
798 #ifdef INVARIANTS
799 counter_u64_zero(rack_adjust_map_bw);
800 #endif
801 counter_u64_zero(rack_to_alloc_hard);
802 counter_u64_zero(rack_to_alloc_emerg);
803 counter_u64_zero(rack_sack_proc_all);
804 counter_u64_zero(rack_fto_send);
805 counter_u64_zero(rack_fto_rsm_send);
806 counter_u64_zero(rack_extended_rfo);
807 counter_u64_zero(rack_hw_pace_init_fail);
808 counter_u64_zero(rack_hw_pace_lost);
809 counter_u64_zero(rack_non_fto_send);
810 counter_u64_zero(rack_nfto_resend);
811 counter_u64_zero(rack_sack_proc_short);
812 counter_u64_zero(rack_sack_proc_restart);
813 counter_u64_zero(rack_to_alloc);
814 counter_u64_zero(rack_to_alloc_limited);
815 counter_u64_zero(rack_alloc_limited_conns);
816 counter_u64_zero(rack_split_limited);
817 counter_u64_zero(rack_rxt_clamps_cwnd);
818 counter_u64_zero(rack_rxt_clamps_cwnd_uniq);
819 counter_u64_zero(rack_multi_single_eq);
820 counter_u64_zero(rack_proc_non_comp_ack);
821 counter_u64_zero(rack_try_scwnd);
822 counter_u64_zero(rack_collapsed_win);
823 counter_u64_zero(rack_collapsed_win_rxt);
824 counter_u64_zero(rack_collapsed_win_seen);
825 counter_u64_zero(rack_collapsed_win_rxt_bytes);
826 } else if (stat == 2) {
827 #ifdef INVARIANTS
828 printf("Clearing RACK option array\n");
829 #endif
830 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE);
831 } else if (stat == 3) {
832 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n");
833 } else if (stat == 4) {
834 #ifdef INVARIANTS
835 printf("Clearing RACK out size array\n");
836 #endif
837 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE);
838 }
839 rack_clear_counter = 0;
840 return (0);
841 }
842
843 static void
rack_init_sysctls(void)844 rack_init_sysctls(void)
845 {
846 struct sysctl_oid *rack_counters;
847 struct sysctl_oid *rack_pacing;
848 struct sysctl_oid *rack_timely;
849 struct sysctl_oid *rack_timers;
850 struct sysctl_oid *rack_tlp;
851 struct sysctl_oid *rack_misc;
852 struct sysctl_oid *rack_features;
853 struct sysctl_oid *rack_measure;
854 struct sysctl_oid *rack_probertt;
855 struct sysctl_oid *rack_hw_pacing;
856
857 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
858 SYSCTL_CHILDREN(rack_sysctl_root),
859 OID_AUTO,
860 "stats",
861 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
862 "Rack Counters");
863 SYSCTL_ADD_S32(&rack_sysctl_ctx,
864 SYSCTL_CHILDREN(rack_sysctl_root),
865 OID_AUTO, "rate_sample_method", CTLFLAG_RW,
866 &rack_rate_sample_method , USE_RTT_LOW,
867 "What method should we use for rate sampling 0=high, 1=low ");
868 /* Probe rtt related controls */
869 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
870 SYSCTL_CHILDREN(rack_sysctl_root),
871 OID_AUTO,
872 "probertt",
873 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
874 "ProbeRTT related Controls");
875 SYSCTL_ADD_U16(&rack_sysctl_ctx,
876 SYSCTL_CHILDREN(rack_probertt),
877 OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
878 &rack_atexit_prtt_hbp, 130,
879 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
880 SYSCTL_ADD_U16(&rack_sysctl_ctx,
881 SYSCTL_CHILDREN(rack_probertt),
882 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
883 &rack_atexit_prtt, 130,
884 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
885 SYSCTL_ADD_U16(&rack_sysctl_ctx,
886 SYSCTL_CHILDREN(rack_probertt),
887 OID_AUTO, "gp_per_mul", CTLFLAG_RW,
888 &rack_per_of_gp_probertt, 60,
889 "What percentage of goodput do we pace at in probertt");
890 SYSCTL_ADD_U16(&rack_sysctl_ctx,
891 SYSCTL_CHILDREN(rack_probertt),
892 OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
893 &rack_per_of_gp_probertt_reduce, 10,
894 "What percentage of goodput do we reduce every gp_srtt");
895 SYSCTL_ADD_U16(&rack_sysctl_ctx,
896 SYSCTL_CHILDREN(rack_probertt),
897 OID_AUTO, "gp_per_low", CTLFLAG_RW,
898 &rack_per_of_gp_lowthresh, 40,
899 "What percentage of goodput do we allow the multiplier to fall to");
900 SYSCTL_ADD_U32(&rack_sysctl_ctx,
901 SYSCTL_CHILDREN(rack_probertt),
902 OID_AUTO, "time_between", CTLFLAG_RW,
903 &rack_time_between_probertt, 96000000,
904 "How many useconds between the lowest rtt falling must past before we enter probertt");
905 SYSCTL_ADD_U32(&rack_sysctl_ctx,
906 SYSCTL_CHILDREN(rack_probertt),
907 OID_AUTO, "safety", CTLFLAG_RW,
908 &rack_probe_rtt_safety_val, 2000000,
909 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
910 SYSCTL_ADD_U32(&rack_sysctl_ctx,
911 SYSCTL_CHILDREN(rack_probertt),
912 OID_AUTO, "sets_cwnd", CTLFLAG_RW,
913 &rack_probe_rtt_sets_cwnd, 0,
914 "Do we set the cwnd too (if always_lower is on)");
915 SYSCTL_ADD_U32(&rack_sysctl_ctx,
916 SYSCTL_CHILDREN(rack_probertt),
917 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
918 &rack_max_drain_wait, 2,
919 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
920 SYSCTL_ADD_U32(&rack_sysctl_ctx,
921 SYSCTL_CHILDREN(rack_probertt),
922 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
923 &rack_must_drain, 1,
924 "We must drain this many gp_srtt's waiting for flight to reach goal");
925 SYSCTL_ADD_U32(&rack_sysctl_ctx,
926 SYSCTL_CHILDREN(rack_probertt),
927 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
928 &rack_probertt_use_min_rtt_entry, 1,
929 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
930 SYSCTL_ADD_U32(&rack_sysctl_ctx,
931 SYSCTL_CHILDREN(rack_probertt),
932 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
933 &rack_probertt_use_min_rtt_exit, 0,
934 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
935 SYSCTL_ADD_U32(&rack_sysctl_ctx,
936 SYSCTL_CHILDREN(rack_probertt),
937 OID_AUTO, "length_div", CTLFLAG_RW,
938 &rack_probertt_gpsrtt_cnt_div, 0,
939 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)");
940 SYSCTL_ADD_U32(&rack_sysctl_ctx,
941 SYSCTL_CHILDREN(rack_probertt),
942 OID_AUTO, "length_mul", CTLFLAG_RW,
943 &rack_probertt_gpsrtt_cnt_mul, 0,
944 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)");
945 SYSCTL_ADD_U32(&rack_sysctl_ctx,
946 SYSCTL_CHILDREN(rack_probertt),
947 OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
948 &rack_min_probertt_hold, 200000,
949 "What is the minimum time we hold probertt at target");
950 SYSCTL_ADD_U32(&rack_sysctl_ctx,
951 SYSCTL_CHILDREN(rack_probertt),
952 OID_AUTO, "filter_life", CTLFLAG_RW,
953 &rack_probertt_filter_life, 10000000,
954 "What is the time for the filters life in useconds");
955 SYSCTL_ADD_U32(&rack_sysctl_ctx,
956 SYSCTL_CHILDREN(rack_probertt),
957 OID_AUTO, "lower_within", CTLFLAG_RW,
958 &rack_probertt_lower_within, 10,
959 "If the rtt goes lower within this percentage of the time, go into probe-rtt");
960 SYSCTL_ADD_U32(&rack_sysctl_ctx,
961 SYSCTL_CHILDREN(rack_probertt),
962 OID_AUTO, "must_move", CTLFLAG_RW,
963 &rack_min_rtt_movement, 250,
964 "How much is the minimum movement in rtt to count as a drop for probertt purposes");
965 SYSCTL_ADD_U32(&rack_sysctl_ctx,
966 SYSCTL_CHILDREN(rack_probertt),
967 OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
968 &rack_probertt_clear_is, 1,
969 "Do we clear I/S counts on exiting probe-rtt");
970 SYSCTL_ADD_S32(&rack_sysctl_ctx,
971 SYSCTL_CHILDREN(rack_probertt),
972 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
973 &rack_max_drain_hbp, 1,
974 "How many extra drain gpsrtt's do we get in highly buffered paths");
975 SYSCTL_ADD_S32(&rack_sysctl_ctx,
976 SYSCTL_CHILDREN(rack_probertt),
977 OID_AUTO, "hbp_threshold", CTLFLAG_RW,
978 &rack_hbp_thresh, 3,
979 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
980 /* Pacing related sysctls */
981 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
982 SYSCTL_CHILDREN(rack_sysctl_root),
983 OID_AUTO,
984 "pacing",
985 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
986 "Pacing related Controls");
987 SYSCTL_ADD_U32(&rack_sysctl_ctx,
988 SYSCTL_CHILDREN(rack_pacing),
989 OID_AUTO, "pcm_enabled", CTLFLAG_RW,
990 &rack_pcm_is_enabled, 1,
991 "Do we by default do PCM measurements?");
992 SYSCTL_ADD_U32(&rack_sysctl_ctx,
993 SYSCTL_CHILDREN(rack_pacing),
994 OID_AUTO, "pcm_rnds", CTLFLAG_RW,
995 &rack_pcm_every_n_rounds, 100,
996 "How many rounds before we need to do a PCM measurement");
997 SYSCTL_ADD_U32(&rack_sysctl_ctx,
998 SYSCTL_CHILDREN(rack_pacing),
999 OID_AUTO, "pcm_blast", CTLFLAG_RW,
1000 &rack_pcm_blast, 0,
1001 "Blast out the full cwnd/rwnd when doing a PCM measurement");
1002 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1003 SYSCTL_CHILDREN(rack_pacing),
1004 OID_AUTO, "rnd_gp_gain", CTLFLAG_RW,
1005 &rack_gp_gain_req, 1200,
1006 "How much do we have to increase the GP to record the round 1200 = 120.0");
1007 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1008 SYSCTL_CHILDREN(rack_pacing),
1009 OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW,
1010 &rack_rnd_cnt_req, 0x10005,
1011 "How many rounds less than rnd_gp_gain will drop us out of SS");
1012 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1013 SYSCTL_CHILDREN(rack_pacing),
1014 OID_AUTO, "no_timely", CTLFLAG_RW,
1015 &rack_timely_off, 0,
1016 "Do we not use timely in DGP?");
1017 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1018 SYSCTL_CHILDREN(rack_pacing),
1019 OID_AUTO, "fillcw", CTLFLAG_RW,
1020 &rack_fill_cw_state, 0,
1021 "Enable fillcw on new connections (default=0 off)?");
1022 SYSCTL_ADD_U16(&rack_sysctl_ctx,
1023 SYSCTL_CHILDREN(rack_pacing),
1024 OID_AUTO, "min_burst", CTLFLAG_RW,
1025 &rack_pacing_min_seg, 0,
1026 "What is the min burst size for pacing (0 disables)?");
1027 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1028 SYSCTL_CHILDREN(rack_pacing),
1029 OID_AUTO, "divisor", CTLFLAG_RW,
1030 &rack_default_pacing_divisor, 250,
1031 "What is the default divisor given to the rl code?");
1032 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1033 SYSCTL_CHILDREN(rack_pacing),
1034 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
1035 &rack_bw_multipler, 0,
1036 "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?");
1037 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1038 SYSCTL_CHILDREN(rack_pacing),
1039 OID_AUTO, "max_pace_over", CTLFLAG_RW,
1040 &rack_max_per_above, 30,
1041 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
1042 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1043 SYSCTL_CHILDREN(rack_pacing),
1044 OID_AUTO, "allow1mss", CTLFLAG_RW,
1045 &rack_pace_one_seg, 0,
1046 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?");
1047 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1048 SYSCTL_CHILDREN(rack_pacing),
1049 OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
1050 &rack_limit_time_with_srtt, 0,
1051 "Do we limit pacing time based on srtt");
1052 SYSCTL_ADD_U16(&rack_sysctl_ctx,
1053 SYSCTL_CHILDREN(rack_pacing),
1054 OID_AUTO, "gp_per_ss", CTLFLAG_RW,
1055 &rack_per_of_gp_ss, 250,
1056 "If non zero, what percentage of goodput to pace at in slow start");
1057 SYSCTL_ADD_U16(&rack_sysctl_ctx,
1058 SYSCTL_CHILDREN(rack_pacing),
1059 OID_AUTO, "gp_per_ca", CTLFLAG_RW,
1060 &rack_per_of_gp_ca, 150,
1061 "If non zero, what percentage of goodput to pace at in congestion avoidance");
1062 SYSCTL_ADD_U16(&rack_sysctl_ctx,
1063 SYSCTL_CHILDREN(rack_pacing),
1064 OID_AUTO, "gp_per_rec", CTLFLAG_RW,
1065 &rack_per_of_gp_rec, 200,
1066 "If non zero, what percentage of goodput to pace at in recovery");
1067 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1068 SYSCTL_CHILDREN(rack_pacing),
1069 OID_AUTO, "pace_max_seg", CTLFLAG_RW,
1070 &rack_hptsi_segments, 40,
1071 "What size is the max for TSO segments in pacing and burst mitigation");
1072 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1073 SYSCTL_CHILDREN(rack_pacing),
1074 OID_AUTO, "burst_reduces", CTLFLAG_RW,
1075 &rack_pacing_delay_reduction, 4,
1076 "When doing only burst mitigation what is the reduce divisor");
1077 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1078 SYSCTL_CHILDREN(rack_sysctl_root),
1079 OID_AUTO, "use_pacing", CTLFLAG_RW,
1080 &rack_pace_every_seg, 0,
1081 "If set we use pacing, if clear we use only the original burst mitigation");
1082 SYSCTL_ADD_U64(&rack_sysctl_ctx,
1083 SYSCTL_CHILDREN(rack_pacing),
1084 OID_AUTO, "rate_cap", CTLFLAG_RW,
1085 &rack_bw_rate_cap, 0,
1086 "If set we apply this value to the absolute rate cap used by pacing");
1087 SYSCTL_ADD_U64(&rack_sysctl_ctx,
1088 SYSCTL_CHILDREN(rack_pacing),
1089 OID_AUTO, "fillcw_cap", CTLFLAG_RW,
1090 &rack_fillcw_bw_cap, 3750000,
1091 "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?");
1092 SYSCTL_ADD_U8(&rack_sysctl_ctx,
1093 SYSCTL_CHILDREN(rack_sysctl_root),
1094 OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
1095 &rack_req_measurements, 1,
1096 "If doing dynamic pacing, how many measurements must be in before we start pacing?");
1097 /* Hardware pacing */
1098 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1099 SYSCTL_CHILDREN(rack_sysctl_root),
1100 OID_AUTO,
1101 "hdwr_pacing",
1102 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1103 "Pacing related Controls");
1104 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1105 SYSCTL_CHILDREN(rack_hw_pacing),
1106 OID_AUTO, "rwnd_factor", CTLFLAG_RW,
1107 &rack_hw_rwnd_factor, 2,
1108 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
1109 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1110 SYSCTL_CHILDREN(rack_hw_pacing),
1111 OID_AUTO, "precheck", CTLFLAG_RW,
1112 &rack_hw_check_queue, 0,
1113 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?");
1114 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1115 SYSCTL_CHILDREN(rack_hw_pacing),
1116 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
1117 &rack_enobuf_hw_boost_mult, 0,
1118 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?");
1119 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1120 SYSCTL_CHILDREN(rack_hw_pacing),
1121 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
1122 &rack_enobuf_hw_max, 2,
1123 "What is the max boost the pacing time if we see a ENOBUFS?");
1124 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1125 SYSCTL_CHILDREN(rack_hw_pacing),
1126 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
1127 &rack_enobuf_hw_min, 2,
1128 "What is the min boost the pacing time if we see a ENOBUFS?");
1129 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1130 SYSCTL_CHILDREN(rack_hw_pacing),
1131 OID_AUTO, "enable", CTLFLAG_RW,
1132 &rack_enable_hw_pacing, 0,
1133 "Should RACK attempt to use hw pacing?");
1134 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1135 SYSCTL_CHILDREN(rack_hw_pacing),
1136 OID_AUTO, "rate_cap", CTLFLAG_RW,
1137 &rack_hw_rate_caps, 0,
1138 "Does the highest hardware pacing rate cap the rate we will send at??");
1139 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1140 SYSCTL_CHILDREN(rack_hw_pacing),
1141 OID_AUTO, "uncap_per", CTLFLAG_RW,
1142 &rack_hw_rate_cap_per, 0,
1143 "If you go over b/w by this amount you will be uncapped (0 = never)");
1144 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1145 SYSCTL_CHILDREN(rack_hw_pacing),
1146 OID_AUTO, "rate_min", CTLFLAG_RW,
1147 &rack_hw_rate_min, 0,
1148 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
1149 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1150 SYSCTL_CHILDREN(rack_hw_pacing),
1151 OID_AUTO, "rate_to_low", CTLFLAG_RW,
1152 &rack_hw_rate_to_low, 0,
1153 "If we fall below this rate, dis-engage hw pacing?");
1154 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1155 SYSCTL_CHILDREN(rack_hw_pacing),
1156 OID_AUTO, "up_only", CTLFLAG_RW,
1157 &rack_hw_up_only, 0,
1158 "Do we allow hw pacing to lower the rate selected?");
1159 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1160 SYSCTL_CHILDREN(rack_sysctl_root),
1161 OID_AUTO,
1162 "timely",
1163 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1164 "Rack Timely RTT Controls");
1165 /* Timely based GP dynmics */
1166 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1167 SYSCTL_CHILDREN(rack_timely),
1168 OID_AUTO, "upper", CTLFLAG_RW,
1169 &rack_gp_per_bw_mul_up, 2,
1170 "Rack timely upper range for equal b/w (in percentage)");
1171 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1172 SYSCTL_CHILDREN(rack_timely),
1173 OID_AUTO, "lower", CTLFLAG_RW,
1174 &rack_gp_per_bw_mul_down, 4,
1175 "Rack timely lower range for equal b/w (in percentage)");
1176 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1177 SYSCTL_CHILDREN(rack_timely),
1178 OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
1179 &rack_gp_rtt_maxmul, 3,
1180 "Rack timely multiplier of lowest rtt for rtt_max");
1181 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1182 SYSCTL_CHILDREN(rack_timely),
1183 OID_AUTO, "rtt_min_div", CTLFLAG_RW,
1184 &rack_gp_rtt_mindiv, 4,
1185 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
1186 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1187 SYSCTL_CHILDREN(rack_timely),
1188 OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
1189 &rack_gp_rtt_minmul, 1,
1190 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
1191 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1192 SYSCTL_CHILDREN(rack_timely),
1193 OID_AUTO, "decrease", CTLFLAG_RW,
1194 &rack_gp_decrease_per, 80,
1195 "Rack timely Beta value 80 = .8 (scaled by 100)");
1196 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1197 SYSCTL_CHILDREN(rack_timely),
1198 OID_AUTO, "increase", CTLFLAG_RW,
1199 &rack_gp_increase_per, 2,
1200 "Rack timely increase perentage of our GP multiplication factor");
1201 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1202 SYSCTL_CHILDREN(rack_timely),
1203 OID_AUTO, "lowerbound", CTLFLAG_RW,
1204 &rack_per_lower_bound, 50,
1205 "Rack timely lowest percentage we allow GP multiplier to fall to");
1206 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1207 SYSCTL_CHILDREN(rack_timely),
1208 OID_AUTO, "p5_upper", CTLFLAG_RW,
1209 &rack_gain_p5_ub, 250,
1210 "Profile 5 upper bound to timely gain");
1211
1212 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1213 SYSCTL_CHILDREN(rack_timely),
1214 OID_AUTO, "upperboundss", CTLFLAG_RW,
1215 &rack_per_upper_bound_ss, 0,
1216 "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
1217 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1218 SYSCTL_CHILDREN(rack_timely),
1219 OID_AUTO, "upperboundca", CTLFLAG_RW,
1220 &rack_per_upper_bound_ca, 0,
1221 "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)");
1222 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1223 SYSCTL_CHILDREN(rack_timely),
1224 OID_AUTO, "dynamicgp", CTLFLAG_RW,
1225 &rack_do_dyn_mul, 0,
1226 "Rack timely do we enable dynmaic timely goodput by default");
1227 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1228 SYSCTL_CHILDREN(rack_timely),
1229 OID_AUTO, "no_rec_red", CTLFLAG_RW,
1230 &rack_gp_no_rec_chg, 1,
1231 "Rack timely do we prohibit the recovery multiplier from being lowered");
1232 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1233 SYSCTL_CHILDREN(rack_timely),
1234 OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
1235 &rack_timely_dec_clear, 6,
1236 "Rack timely what threshold do we count to before another boost during b/w decent");
1237 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1238 SYSCTL_CHILDREN(rack_timely),
1239 OID_AUTO, "max_push_rise", CTLFLAG_RW,
1240 &rack_timely_max_push_rise, 3,
1241 "Rack timely how many times do we push up with b/w increase");
1242 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1243 SYSCTL_CHILDREN(rack_timely),
1244 OID_AUTO, "max_push_drop", CTLFLAG_RW,
1245 &rack_timely_max_push_drop, 3,
1246 "Rack timely how many times do we push back on b/w decent");
1247 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1248 SYSCTL_CHILDREN(rack_timely),
1249 OID_AUTO, "min_segs", CTLFLAG_RW,
1250 &rack_timely_min_segs, 4,
1251 "Rack timely when setting the cwnd what is the min num segments");
1252 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1253 SYSCTL_CHILDREN(rack_timely),
1254 OID_AUTO, "nonstop", CTLFLAG_RW,
1255 &rack_timely_no_stopping, 0,
1256 "Rack timely don't stop increase");
1257 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1258 SYSCTL_CHILDREN(rack_timely),
1259 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
1260 &rack_down_raise_thresh, 100,
1261 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
1262 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1263 SYSCTL_CHILDREN(rack_timely),
1264 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
1265 &rack_req_segs, 1,
1266 "Bottom dragging if not these many segments outstanding and room");
1267
1268 /* TLP and Rack related parameters */
1269 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1270 SYSCTL_CHILDREN(rack_sysctl_root),
1271 OID_AUTO,
1272 "tlp",
1273 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1274 "TLP and Rack related Controls");
1275 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1276 SYSCTL_CHILDREN(rack_tlp),
1277 OID_AUTO, "use_rrr", CTLFLAG_RW,
1278 &use_rack_rr, 1,
1279 "Do we use Rack Rapid Recovery");
1280 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1281 SYSCTL_CHILDREN(rack_tlp),
1282 OID_AUTO, "post_rec_labc", CTLFLAG_RW,
1283 &rack_max_abc_post_recovery, 2,
1284 "Since we do early recovery, do we override the l_abc to a value, if so what?");
1285 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1286 SYSCTL_CHILDREN(rack_tlp),
1287 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
1288 &rack_non_rxt_use_cr, 0,
1289 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
1290 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1291 SYSCTL_CHILDREN(rack_tlp),
1292 OID_AUTO, "tlpmethod", CTLFLAG_RW,
1293 &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
1294 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
1295 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1296 SYSCTL_CHILDREN(rack_tlp),
1297 OID_AUTO, "limit", CTLFLAG_RW,
1298 &rack_tlp_limit, 2,
1299 "How many TLP's can be sent without sending new data");
1300 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1301 SYSCTL_CHILDREN(rack_tlp),
1302 OID_AUTO, "use_greater", CTLFLAG_RW,
1303 &rack_tlp_use_greater, 1,
1304 "Should we use the rack_rtt time if its greater than srtt");
1305 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1306 SYSCTL_CHILDREN(rack_tlp),
1307 OID_AUTO, "tlpminto", CTLFLAG_RW,
1308 &rack_tlp_min, 10000,
1309 "TLP minimum timeout per the specification (in microseconds)");
1310 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1311 SYSCTL_CHILDREN(rack_tlp),
1312 OID_AUTO, "send_oldest", CTLFLAG_RW,
1313 &rack_always_send_oldest, 0,
1314 "Should we always send the oldest TLP and RACK-TLP");
1315 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1316 SYSCTL_CHILDREN(rack_tlp),
1317 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
1318 &rack_lower_cwnd_at_tlp, 0,
1319 "When a TLP completes a retran should we enter recovery");
1320 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1321 SYSCTL_CHILDREN(rack_tlp),
1322 OID_AUTO, "reorder_thresh", CTLFLAG_RW,
1323 &rack_reorder_thresh, 2,
1324 "What factor for rack will be added when seeing reordering (shift right)");
1325 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1326 SYSCTL_CHILDREN(rack_tlp),
1327 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
1328 &rack_tlp_thresh, 1,
1329 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
1330 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1331 SYSCTL_CHILDREN(rack_tlp),
1332 OID_AUTO, "reorder_fade", CTLFLAG_RW,
1333 &rack_reorder_fade, 60000000,
1334 "Does reorder detection fade, if so how many microseconds (0 means never)");
1335 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1336 SYSCTL_CHILDREN(rack_tlp),
1337 OID_AUTO, "pktdelay", CTLFLAG_RW,
1338 &rack_pkt_delay, 1000,
1339 "Extra RACK time (in microseconds) besides reordering thresh");
1340
1341 /* Timer related controls */
1342 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1343 SYSCTL_CHILDREN(rack_sysctl_root),
1344 OID_AUTO,
1345 "timers",
1346 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1347 "Timer related controls");
1348 SYSCTL_ADD_U8(&rack_sysctl_ctx,
1349 SYSCTL_CHILDREN(rack_timers),
1350 OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW,
1351 &rack_ssthresh_rest_rto_rec, 0,
1352 "When doing recovery -> rto -> recovery do we reset SSthresh?");
1353 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1354 SYSCTL_CHILDREN(rack_timers),
1355 OID_AUTO, "rtt_divisor", CTLFLAG_RW,
1356 &rack_rtt_divisor, 2,
1357 "When calculating the rtt threshold what 1/N is a rtt that indicates reordering");
1358 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1359 SYSCTL_CHILDREN(rack_timers),
1360 OID_AUTO, "scoreboard_thresh", CTLFLAG_RW,
1361 &rack_rxt_scoreboard_clear_thresh, 2,
1362 "How many RTO's are allowed before we clear the scoreboard");
1363 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1364 SYSCTL_CHILDREN(rack_timers),
1365 OID_AUTO, "honor_hpts_min", CTLFLAG_RW,
1366 &rack_honors_hpts_min_to, 1,
1367 "Do rack pacing timers honor hpts min timeout");
1368 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1369 SYSCTL_CHILDREN(rack_timers),
1370 OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
1371 &rack_max_reduce, 10,
1372 "Max percentage we will reduce pacing delay by for pacing when we are behind");
1373 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1374 SYSCTL_CHILDREN(rack_timers),
1375 OID_AUTO, "persmin", CTLFLAG_RW,
1376 &rack_persist_min, 250000,
1377 "What is the minimum time in microseconds between persists");
1378 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1379 SYSCTL_CHILDREN(rack_timers),
1380 OID_AUTO, "persmax", CTLFLAG_RW,
1381 &rack_persist_max, 2000000,
1382 "What is the largest delay in microseconds between persists");
1383 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1384 SYSCTL_CHILDREN(rack_timers),
1385 OID_AUTO, "delayed_ack", CTLFLAG_RW,
1386 &rack_delayed_ack_time, 40000,
1387 "Delayed ack time (40ms in microseconds)");
1388 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1389 SYSCTL_CHILDREN(rack_timers),
1390 OID_AUTO, "minrto", CTLFLAG_RW,
1391 &rack_rto_min, 30000,
1392 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
1393 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1394 SYSCTL_CHILDREN(rack_timers),
1395 OID_AUTO, "maxrto", CTLFLAG_RW,
1396 &rack_rto_max, 4000000,
1397 "Maximum RTO in microseconds -- should be at least as large as min_rto");
1398 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1399 SYSCTL_CHILDREN(rack_timers),
1400 OID_AUTO, "minto", CTLFLAG_RW,
1401 &rack_min_to, 1000,
1402 "Minimum rack timeout in microseconds");
1403 /* Measure controls */
1404 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1405 SYSCTL_CHILDREN(rack_sysctl_root),
1406 OID_AUTO,
1407 "measure",
1408 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1409 "Measure related controls");
1410 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1411 SYSCTL_CHILDREN(rack_measure),
1412 OID_AUTO, "wma_divisor", CTLFLAG_RW,
1413 &rack_wma_divisor, 8,
1414 "When doing b/w calculation what is the divisor for the WMA");
1415 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1416 SYSCTL_CHILDREN(rack_measure),
1417 OID_AUTO, "end_cwnd", CTLFLAG_RW,
1418 &rack_cwnd_block_ends_measure, 0,
1419 "Does a cwnd just-return end the measurement window (app limited)");
1420 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1421 SYSCTL_CHILDREN(rack_measure),
1422 OID_AUTO, "end_rwnd", CTLFLAG_RW,
1423 &rack_rwnd_block_ends_measure, 0,
1424 "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1425 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1426 SYSCTL_CHILDREN(rack_measure),
1427 OID_AUTO, "min_target", CTLFLAG_RW,
1428 &rack_def_data_window, 20,
1429 "What is the minimum target window (in mss) for a GP measurements");
1430 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1431 SYSCTL_CHILDREN(rack_measure),
1432 OID_AUTO, "goal_bdp", CTLFLAG_RW,
1433 &rack_goal_bdp, 2,
1434 "What is the goal BDP to measure");
1435 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1436 SYSCTL_CHILDREN(rack_measure),
1437 OID_AUTO, "min_srtts", CTLFLAG_RW,
1438 &rack_min_srtts, 1,
1439 "What is the goal BDP to measure");
1440 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1441 SYSCTL_CHILDREN(rack_measure),
1442 OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1443 &rack_min_measure_usec, 0,
1444 "What is the Minimum time time for a measurement if 0, this is off");
1445 /* Features */
1446 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1447 SYSCTL_CHILDREN(rack_sysctl_root),
1448 OID_AUTO,
1449 "features",
1450 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1451 "Feature controls");
1452 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1453 SYSCTL_CHILDREN(rack_features),
1454 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
1455 &rack_hybrid_allow_set_maxseg, 0,
1456 "Should hybrid pacing allow the setmss command");
1457 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1458 SYSCTL_CHILDREN(rack_features),
1459 OID_AUTO, "cmpack", CTLFLAG_RW,
1460 &rack_use_cmp_acks, 1,
1461 "Should RACK have LRO send compressed acks");
1462 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1463 SYSCTL_CHILDREN(rack_features),
1464 OID_AUTO, "fsb", CTLFLAG_RW,
1465 &rack_use_fsb, 1,
1466 "Should RACK use the fast send block?");
1467 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1468 SYSCTL_CHILDREN(rack_features),
1469 OID_AUTO, "rfo", CTLFLAG_RW,
1470 &rack_use_rfo, 1,
1471 "Should RACK use rack_fast_output()?");
1472 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1473 SYSCTL_CHILDREN(rack_features),
1474 OID_AUTO, "rsmrfo", CTLFLAG_RW,
1475 &rack_use_rsm_rfo, 1,
1476 "Should RACK use rack_fast_rsm_output()?");
1477 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1478 SYSCTL_CHILDREN(rack_features),
1479 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1480 &rack_enable_mqueue_for_nonpaced, 0,
1481 "Should RACK use mbuf queuing for non-paced connections");
1482 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1483 SYSCTL_CHILDREN(rack_features),
1484 OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1485 &rack_do_hystart, 0,
1486 "Should RACK enable HyStart++ on connections?");
1487 /* Misc rack controls */
1488 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1489 SYSCTL_CHILDREN(rack_sysctl_root),
1490 OID_AUTO,
1491 "misc",
1492 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1493 "Misc related controls");
1494 #ifdef TCP_ACCOUNTING
1495 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1496 SYSCTL_CHILDREN(rack_misc),
1497 OID_AUTO, "tcp_acct", CTLFLAG_RW,
1498 &rack_tcp_accounting, 0,
1499 "Should we turn on TCP accounting for all rack sessions?");
1500 #endif
1501 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1502 SYSCTL_CHILDREN(rack_misc),
1503 OID_AUTO, "dnd", CTLFLAG_RW,
1504 &rack_dnd_default, 0,
1505 "Do not disturb default for rack_rrr = 3");
1506 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1507 SYSCTL_CHILDREN(rack_misc),
1508 OID_AUTO, "rxt_controls", CTLFLAG_RW,
1509 &rack_rxt_controls, 0,
1510 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?");
1511 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1512 SYSCTL_CHILDREN(rack_misc),
1513 OID_AUTO, "rack_hibeta", CTLFLAG_RW,
1514 &rack_hibeta_setting, 0,
1515 "Do we ue a high beta (80 instead of 50)?");
1516 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1517 SYSCTL_CHILDREN(rack_misc),
1518 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
1519 &rack_apply_rtt_with_reduced_conf, 0,
1520 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
1521 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1522 SYSCTL_CHILDREN(rack_misc),
1523 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1524 &rack_dsack_std_based, 3,
1525 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1526 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1527 SYSCTL_CHILDREN(rack_misc),
1528 OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1529 &rack_prr_addbackmax, 2,
1530 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1531 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1532 SYSCTL_CHILDREN(rack_misc),
1533 OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1534 &rack_stats_gets_ms_rtt, 1,
1535 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1536 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1537 SYSCTL_CHILDREN(rack_misc),
1538 OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1539 &rack_client_low_buf, 0,
1540 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
1541 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1542 SYSCTL_CHILDREN(rack_misc),
1543 OID_AUTO, "defprofile", CTLFLAG_RW,
1544 &rack_def_profile, 0,
1545 "Should RACK use a default profile (0=no, num == profile num)?");
1546 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1547 SYSCTL_CHILDREN(rack_misc),
1548 OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1549 &rack_enable_shared_cwnd, 1,
1550 "Should RACK try to use the shared cwnd on connections where allowed");
1551 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1552 SYSCTL_CHILDREN(rack_misc),
1553 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1554 &rack_limits_scwnd, 1,
1555 "Should RACK place low end time limits on the shared cwnd feature");
1556 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1557 SYSCTL_CHILDREN(rack_misc),
1558 OID_AUTO, "no_prr", CTLFLAG_RW,
1559 &rack_disable_prr, 0,
1560 "Should RACK not use prr and only pace (must have pacing on)");
1561 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1562 SYSCTL_CHILDREN(rack_misc),
1563 OID_AUTO, "bb_verbose", CTLFLAG_RW,
1564 &rack_verbose_logging, 0,
1565 "Should RACK black box logging be verbose");
1566 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1567 SYSCTL_CHILDREN(rack_misc),
1568 OID_AUTO, "data_after_close", CTLFLAG_RW,
1569 &rack_ignore_data_after_close, 1,
1570 "Do we hold off sending a RST until all pending data is ack'd");
1571 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1572 SYSCTL_CHILDREN(rack_misc),
1573 OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1574 &rack_sack_not_required, 1,
1575 "Do we allow rack to run on connections not supporting SACK");
1576 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1577 SYSCTL_CHILDREN(rack_misc),
1578 OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1579 &rack_send_a_lot_in_prr, 1,
1580 "Send a lot in prr");
1581 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1582 SYSCTL_CHILDREN(rack_misc),
1583 OID_AUTO, "autoscale", CTLFLAG_RW,
1584 &rack_autosndbuf_inc, 20,
1585 "What percentage should rack scale up its snd buffer by?");
1586
1587 /* Counters */
1588 rack_total_bytes = counter_u64_alloc(M_WAITOK);
1589 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1590 SYSCTL_CHILDREN(rack_counters),
1591 OID_AUTO, "totalbytes", CTLFLAG_RD,
1592 &rack_total_bytes,
1593 "Total number of bytes sent");
1594 rack_fto_send = counter_u64_alloc(M_WAITOK);
1595 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1596 SYSCTL_CHILDREN(rack_counters),
1597 OID_AUTO, "fto_send", CTLFLAG_RD,
1598 &rack_fto_send, "Total number of rack_fast_output sends");
1599 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1600 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1601 SYSCTL_CHILDREN(rack_counters),
1602 OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1603 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1604 rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1605 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1606 SYSCTL_CHILDREN(rack_counters),
1607 OID_AUTO, "nfto_resend", CTLFLAG_RD,
1608 &rack_nfto_resend, "Total number of rack_output retransmissions");
1609 rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1610 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1611 SYSCTL_CHILDREN(rack_counters),
1612 OID_AUTO, "nfto_send", CTLFLAG_RD,
1613 &rack_non_fto_send, "Total number of rack_output first sends");
1614 rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1615 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1616 SYSCTL_CHILDREN(rack_counters),
1617 OID_AUTO, "rfo_extended", CTLFLAG_RD,
1618 &rack_extended_rfo, "Total number of times we extended rfo");
1619
1620 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1621 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1622 SYSCTL_CHILDREN(rack_counters),
1623 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1624 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1625 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1626
1627 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1628 SYSCTL_CHILDREN(rack_counters),
1629 OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1630 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
1631 rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1632 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1633 SYSCTL_CHILDREN(rack_counters),
1634 OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1635 &rack_tlp_tot,
1636 "Total number of tail loss probe expirations");
1637 rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1638 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1639 SYSCTL_CHILDREN(rack_counters),
1640 OID_AUTO, "tlp_new", CTLFLAG_RD,
1641 &rack_tlp_newdata,
1642 "Total number of tail loss probe sending new data");
1643 rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1644 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1645 SYSCTL_CHILDREN(rack_counters),
1646 OID_AUTO, "tlp_retran", CTLFLAG_RD,
1647 &rack_tlp_retran,
1648 "Total number of tail loss probe sending retransmitted data");
1649 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1650 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1651 SYSCTL_CHILDREN(rack_counters),
1652 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1653 &rack_tlp_retran_bytes,
1654 "Total bytes of tail loss probe sending retransmitted data");
1655 rack_to_tot = counter_u64_alloc(M_WAITOK);
1656 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1657 SYSCTL_CHILDREN(rack_counters),
1658 OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1659 &rack_to_tot,
1660 "Total number of times the rack to expired");
1661 rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1662 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1663 SYSCTL_CHILDREN(rack_counters),
1664 OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1665 &rack_saw_enobuf,
1666 "Total number of times a sends returned enobuf for non-hdwr paced connections");
1667 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1668 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1669 SYSCTL_CHILDREN(rack_counters),
1670 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1671 &rack_saw_enobuf_hw,
1672 "Total number of times a send returned enobuf for hdwr paced connections");
1673 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1674 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1675 SYSCTL_CHILDREN(rack_counters),
1676 OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1677 &rack_saw_enetunreach,
1678 "Total number of times a send received a enetunreachable");
1679 rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1680 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1681 SYSCTL_CHILDREN(rack_counters),
1682 OID_AUTO, "alloc_hot", CTLFLAG_RD,
1683 &rack_hot_alloc,
1684 "Total allocations from the top of our list");
1685 rack_to_alloc = counter_u64_alloc(M_WAITOK);
1686 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1687 SYSCTL_CHILDREN(rack_counters),
1688 OID_AUTO, "allocs", CTLFLAG_RD,
1689 &rack_to_alloc,
1690 "Total allocations of tracking structures");
1691 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1692 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1693 SYSCTL_CHILDREN(rack_counters),
1694 OID_AUTO, "allochard", CTLFLAG_RD,
1695 &rack_to_alloc_hard,
1696 "Total allocations done with sleeping the hard way");
1697 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1698 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1699 SYSCTL_CHILDREN(rack_counters),
1700 OID_AUTO, "allocemerg", CTLFLAG_RD,
1701 &rack_to_alloc_emerg,
1702 "Total allocations done from emergency cache");
1703 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1704 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1705 SYSCTL_CHILDREN(rack_counters),
1706 OID_AUTO, "alloc_limited", CTLFLAG_RD,
1707 &rack_to_alloc_limited,
1708 "Total allocations dropped due to limit");
1709 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1710 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1711 SYSCTL_CHILDREN(rack_counters),
1712 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1713 &rack_alloc_limited_conns,
1714 "Connections with allocations dropped due to limit");
1715 rack_split_limited = counter_u64_alloc(M_WAITOK);
1716 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1717 SYSCTL_CHILDREN(rack_counters),
1718 OID_AUTO, "split_limited", CTLFLAG_RD,
1719 &rack_split_limited,
1720 "Split allocations dropped due to limit");
1721 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK);
1722 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1723 SYSCTL_CHILDREN(rack_counters),
1724 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD,
1725 &rack_rxt_clamps_cwnd,
1726 "Number of times that excessive rxt clamped the cwnd down");
1727 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK);
1728 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1729 SYSCTL_CHILDREN(rack_counters),
1730 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD,
1731 &rack_rxt_clamps_cwnd_uniq,
1732 "Number of connections that have had excessive rxt clamped the cwnd down");
1733 rack_persists_sends = counter_u64_alloc(M_WAITOK);
1734 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1735 SYSCTL_CHILDREN(rack_counters),
1736 OID_AUTO, "persist_sends", CTLFLAG_RD,
1737 &rack_persists_sends,
1738 "Number of times we sent a persist probe");
1739 rack_persists_acks = counter_u64_alloc(M_WAITOK);
1740 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1741 SYSCTL_CHILDREN(rack_counters),
1742 OID_AUTO, "persist_acks", CTLFLAG_RD,
1743 &rack_persists_acks,
1744 "Number of times a persist probe was acked");
1745 rack_persists_loss = counter_u64_alloc(M_WAITOK);
1746 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1747 SYSCTL_CHILDREN(rack_counters),
1748 OID_AUTO, "persist_loss", CTLFLAG_RD,
1749 &rack_persists_loss,
1750 "Number of times we detected a lost persist probe (no ack)");
1751 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
1752 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1753 SYSCTL_CHILDREN(rack_counters),
1754 OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
1755 &rack_persists_lost_ends,
1756 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
1757 #ifdef INVARIANTS
1758 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1759 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1760 SYSCTL_CHILDREN(rack_counters),
1761 OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1762 &rack_adjust_map_bw,
1763 "Number of times we hit the case where the sb went up and down on a sendmap entry");
1764 #endif
1765 rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1766 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1767 SYSCTL_CHILDREN(rack_counters),
1768 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1769 &rack_multi_single_eq,
1770 "Number of compressed acks total represented");
1771 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1772 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1773 SYSCTL_CHILDREN(rack_counters),
1774 OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1775 &rack_proc_non_comp_ack,
1776 "Number of non compresseds acks that we processed");
1777
1778
1779 rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1781 SYSCTL_CHILDREN(rack_counters),
1782 OID_AUTO, "sack_long", CTLFLAG_RD,
1783 &rack_sack_proc_all,
1784 "Total times we had to walk whole list for sack processing");
1785 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1787 SYSCTL_CHILDREN(rack_counters),
1788 OID_AUTO, "sack_restart", CTLFLAG_RD,
1789 &rack_sack_proc_restart,
1790 "Total times we had to walk whole list due to a restart");
1791 rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1793 SYSCTL_CHILDREN(rack_counters),
1794 OID_AUTO, "sack_short", CTLFLAG_RD,
1795 &rack_sack_proc_short,
1796 "Total times we took shortcut for sack processing");
1797 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1799 SYSCTL_CHILDREN(rack_counters),
1800 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1801 &rack_input_idle_reduces,
1802 "Total number of idle reductions on input");
1803 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
1804 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1805 SYSCTL_CHILDREN(rack_counters),
1806 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
1807 &rack_collapsed_win_seen,
1808 "Total number of collapsed window events seen (where our window shrinks)");
1809
1810 rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1811 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1812 SYSCTL_CHILDREN(rack_counters),
1813 OID_AUTO, "collapsed_win", CTLFLAG_RD,
1814 &rack_collapsed_win,
1815 "Total number of collapsed window events where we mark packets");
1816 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
1817 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1818 SYSCTL_CHILDREN(rack_counters),
1819 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
1820 &rack_collapsed_win_rxt,
1821 "Total number of packets that were retransmitted");
1822 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
1823 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1824 SYSCTL_CHILDREN(rack_counters),
1825 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
1826 &rack_collapsed_win_rxt_bytes,
1827 "Total number of bytes that were retransmitted");
1828 rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1829 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1830 SYSCTL_CHILDREN(rack_counters),
1831 OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1832 &rack_try_scwnd,
1833 "Total number of scwnd attempts");
1834 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1835 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1836 OID_AUTO, "outsize", CTLFLAG_RD,
1837 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1838 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1839 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1840 OID_AUTO, "opts", CTLFLAG_RD,
1841 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1842 SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1843 SYSCTL_CHILDREN(rack_sysctl_root),
1844 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1845 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1846 }
1847
1848 static uint32_t
rc_init_window(struct tcp_rack * rack)1849 rc_init_window(struct tcp_rack *rack)
1850 {
1851 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1852
1853 }
1854
1855 static uint64_t
rack_get_fixed_pacing_bw(struct tcp_rack * rack)1856 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1857 {
1858 if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
1859 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1860 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1861 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1862 else
1863 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1864 }
1865
/*
 * Emit a hybrid-pacing bandwidth event into black box logging.
 *
 * seq  - sequence number recorded in flex1.
 * cbw  - bandwidth value being logged (bw_inuse).
 * tim  - time value associated with the event (rttProp).
 * data - length value supplied by callers; NOTE(review): not written
 *        into the log record anywhere in this function - confirm
 *        whether that is intentional.
 * mod  - HYBRID_LOG_* event type, recorded in flex8 and used to pick
 *        the logging gate and field layout.
 * aux  - auxiliary value recorded in pacing_gain.
 * cur  - sendfile tracking entry the event refers to; if NULL the
 *        last known entry (rc_last_sft) is used.
 * line - caller's source line, recorded in cwnd_gain.
 *
 * Compiled out entirely unless TCP_REQUEST_TRK is defined.
 */
static void
rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim,
		   uint64_t data, uint8_t mod, uint16_t aux,
		   struct tcp_sendfile_track *cur, int line)
{
#ifdef TCP_REQUEST_TRK
	int do_log = 0;

	/*
	 * The rate cap one is noisy and only should come out when normal BB logging
	 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out
	 * once per chunk and make up the BBpoint that can be turned on by the client.
	 */
	if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
		/*
		 * The very noisy two need to only come out when
		 * we have verbose logging on.
		 */
		if (rack_verbose_logging != 0)
			do_log = tcp_bblogging_on(rack->rc_tp);
		else
			do_log = 0;
	} else if (mod != HYBRID_LOG_BW_MEASURE) {
		/*
		 * All other less noisy logs here except the measure which
		 * also needs to come out on the point and the log.
		 */
		do_log = tcp_bblogging_on(rack->rc_tp);
	} else {
		/* BW_MEASURE: gated on the request-level BB point. */
		do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING);
	}

	if (do_log) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint64_t lt_bw;

		/* Convert our ms to a microsecond */
		memset(&log, 0, sizeof(log));

		log.u_bbr.cwnd_gain = line;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.rttProp = tim;
		log.u_bbr.bw_inuse = cbw;
		log.u_bbr.delRate = rack_get_gp_est(rack);
		lt_bw = rack_get_lt_bw(rack);
		log.u_bbr.flex1 = seq;
		log.u_bbr.pacing_gain = aux;
		/* lt_bw = < flex3 | flex2 > */
		log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff);
		log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff);
		/* Record the last obtained us rtt in inflight */
		if (cur == NULL) {
			/* Make sure we are looking at the right log if an overide comes in */
			cur = rack->r_ctl.rc_last_sft;
		}
		if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY)
			log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt;
		else {
			/* Use the last known rtt i.e. the rack-rtt */
			log.u_bbr.inflight = rack->rc_rack_rtt;
		}
		if (cur != NULL) {
			uint64_t off;

			log.u_bbr.cur_del_rate = cur->deadline;
			if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
				/*
				 * For the two cap events record the raw
				 * sequence range of the tracking entry.
				 */
				/* start = < lost | pkt_epoch > */
				log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
				log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
				log.u_bbr.flex6 = cur->start_seq;
				log.u_bbr.pkts_out = cur->end_seq;
			} else {
				/* start = < lost | pkt_epoch > */
				log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
				log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
				/* end = < pkts_out | flex6 > */
				log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff);
				log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
			}
			/* first_send = <lt_epoch | epoch> */
			log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff);
			log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff);
			/* localtime = <delivered | applimited>*/
			log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
			log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
#ifdef TCP_REQUEST_TRK
			/* Index of this entry within t_tcpreq_info[]. */
			off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
			log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
#endif
			log.u_bbr.inhpts = 1;
			/* Bytes sent/retransmitted since this entry's first send. */
			log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
			log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
			log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
		} else {
			/* No tracking entry available: mark fields invalid. */
			log.u_bbr.flex7 = 0xffff;
			log.u_bbr.cur_del_rate = 0xffffffffffffffff;
		}
		/*
		 * Compose bbr_state to be a bit wise 0000ADHF
		 * where A is the always_pace flag
		 * where D is the dgp_on flag
		 * where H is the hybrid_mode on flag
		 * where F is the use_fixed_rate flag.
		 */
		log.u_bbr.bbr_state = rack->rc_always_pace;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->dgp_on;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->use_fixed_rate;
		log.u_bbr.flex8 = mod;
		tcp_log_event(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_HYBRID_PACING_LOG, 0,
		    0, &log, false, NULL, __func__, __LINE__, &tv);

	}
#endif
}
1988
1989 #ifdef TCP_REQUEST_TRK
/*
 * Emit a HYBRID_LOG_SENT_LOST black-box record describing the send and
 * retransmit byte counts for one sendfile tracking entry.
 *
 * cur  - the tracking entry being reported on.
 * line - caller's source line, recorded in cwnd_gain.
 *
 * Only logs when the request-level BB logging point is enabled on the
 * connection.
 */
static void
rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line)
{
	if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		uint64_t off;

		/* Convert our ms to a microsecond */
		memset(&log, 0, sizeof(log));

		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		/* Bytes sent when this entry's first send occurred. */
		log.u_bbr.delRate = cur->sent_at_fs;

		if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) {
			/*
			 * We did not get a new Rules Applied to set so
			 * no overlapping send occured, this means the
			 * current byte counts are correct.
			 */
			log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
			log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
		} else {
			/*
			 * Overlapping send case, we switched to a new
			 * send and did a rules applied.
			 */
			log.u_bbr.cur_del_rate = cur->sent_at_ls;
			log.u_bbr.rttProp = cur->rxt_at_ls;
		}
		log.u_bbr.bw_inuse = cur->rxt_at_fs;
		log.u_bbr.cwnd_gain = line;
		/* Index of this entry within t_tcpreq_info[]. */
		off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
		log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
		/* start = < flex1 | flex2 > */
		log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff);
		log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
		/* end = < flex3 | flex4 > */
		log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff);
		log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);

		/* localtime = <delivered | applimited>*/
		log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
		log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
		/* client timestamp = <lt_epoch | epoch>*/
		log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff);
		log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff);
		/* now set all the flags in */
		log.u_bbr.pkts_out = cur->hybrid_flags;
		log.u_bbr.lost = cur->playout_ms;
		log.u_bbr.flex6 = cur->flags;
		/*
		 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases
		 * where a false retransmit occurred so first_send <-> lastsend may
		 * include longer time then it actually took if we have a false rxt.
		 */
		log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff);
		log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff);
		/*
		 * Compose bbr_state to be a bit wise 0000ADHF
		 * where A is the always_pace flag
		 * where D is the dgp_on flag
		 * where H is the hybrid_mode on flag
		 * where F is the use_fixed_rate flag.
		 */
		log.u_bbr.bbr_state = rack->rc_always_pace;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->dgp_on;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->use_fixed_rate;

		log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST;
		tcp_log_event(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_HYBRID_PACING_LOG, 0,
		    0, &log, false, NULL, __func__, __LINE__, &tv);
	}
}
2071 #endif
2072
2073 static inline uint64_t
rack_compensate_for_linerate(struct tcp_rack * rack,uint64_t bw)2074 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw)
2075 {
2076 uint64_t ret_bw, ether;
2077 uint64_t u_segsiz;
2078
2079 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr);
2080 if (rack->r_is_v6){
2081 #ifdef INET6
2082 ether += sizeof(struct ip6_hdr);
2083 #endif
2084 ether += 14; /* eheader size 6+6+2 */
2085 } else {
2086 #ifdef INET
2087 ether += sizeof(struct ip);
2088 #endif
2089 ether += 14; /* eheader size 6+6+2 */
2090 }
2091 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs);
2092 ret_bw = bw;
2093 ret_bw *= ether;
2094 ret_bw /= u_segsiz;
2095 return (ret_bw);
2096 }
2097
/*
 * Cap the pacing bandwidth *bw at the configured/dynamic rate cap.
 *
 * bw     - in/out: pacing bandwidth; reduced in place when capped.
 * capped - out: set to 1 when *bw was reduced by a cap.
 *
 * When hybrid pacing is in catch-up mode (TCP_REQUEST_TRK only) the cap
 * is recomputed dynamically from the bytes remaining in the tracking
 * entry and the time left before its deadline; any error/degenerate
 * case disables the cap (bw_rate_cap = 0) and falls back to plain DGP.
 * Otherwise the static bw_rate_cap is applied. Side effects: may clear
 * or update r_ctl.bw_rate_cap and shrink rc_pace_max_segs.
 */
static void
rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped)
{
#ifdef TCP_REQUEST_TRK
	struct timeval tv;
	uint64_t timenow, timeleft, lenleft, lengone, calcbw;
#endif

	/* No cap configured at all, nothing to do. */
	if (rack->r_ctl.bw_rate_cap == 0)
		return;
#ifdef TCP_REQUEST_TRK
	if (rack->rc_catch_up && rack->rc_hybrid_mode &&
	    (rack->r_ctl.rc_last_sft != NULL)) {
		/*
		 * We have a dynamic cap. The original target
		 * is in bw_rate_cap, but we need to look at
		 * how long it is until we hit the deadline.
		 */
		struct tcp_sendfile_track *ent;

		ent = rack->r_ctl.rc_last_sft;
		microuptime(&tv);
		timenow = tcp_tv_to_lusec(&tv);
		if (timenow >= ent->deadline) {
			/* No time left we do DGP only */
			rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
					   0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__);
			rack->r_ctl.bw_rate_cap = 0;
			return;
		}
		/* We have the time */
		timeleft = rack->r_ctl.rc_last_sft->deadline - timenow;
		if (timeleft < HPTS_MSEC_IN_SEC) {
			/* If there is less than a ms left just use DGPs rate */
			rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
					   0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__);
			rack->r_ctl.bw_rate_cap = 0;
			return;
		}
		/*
		 * Now lets find the amount of data left to send.
		 *
		 * Now ideally we want to use the end_seq to figure out how much more
		 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry..
		 */
		if (ent->flags & TCP_TRK_TRACK_FLG_COMP) {
			/* Entry is complete: exact remaining bytes are known. */
			if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una))
				lenleft = ent->end_seq - rack->rc_tp->snd_una;
			else {
				/* TSNH, we should catch it at the send */
				rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
						   0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__);
				rack->r_ctl.bw_rate_cap = 0;
				return;
			}
		} else {
			/*
			 * The hard way, figure out how much is gone and then
			 * take that away from the total the client asked for
			 * (thats off by tls overhead if this is tls).
			 */
			if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq))
				lengone = rack->rc_tp->snd_una - ent->start_seq;
			else
				lengone = 0;
			if (lengone < (ent->end - ent->start))
				lenleft = (ent->end - ent->start) - lengone;
			else {
				/* TSNH, we should catch it at the send */
				rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
						   0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__);
				rack->r_ctl.bw_rate_cap = 0;
				return;
			}
		}
		if (lenleft == 0) {
			/* We have it all sent */
			rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
					   0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__);
			if (rack->r_ctl.bw_rate_cap)
				goto normal_ratecap;
			else
				return;
		}
		/* Required rate = bytes left * usec-per-sec / usec left. */
		calcbw = lenleft * HPTS_USEC_IN_SEC;
		calcbw /= timeleft;
		/* Now we must compensate for IP/TCP overhead */
		calcbw = rack_compensate_for_linerate(rack, calcbw);
		/* Update the bit rate cap */
		rack->r_ctl.bw_rate_cap = calcbw;
		if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
		    (rack_hybrid_allow_set_maxseg == 1) &&
		    ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
			/* Lets set in a smaller mss possibly here to match our rate-cap */
			uint32_t orig_max;

			orig_max = rack->r_ctl.rc_pace_max_segs;
			rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
			rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp));
			rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
		}
		rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
				   calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__);
		if ((calcbw > 0) && (*bw > calcbw)) {
			/* DGP wants to go faster than the deadline requires: cap it. */
			rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
					   *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__);
			*capped = 1;
			*bw = calcbw;
		}
		return;
	}
normal_ratecap:
#endif
	/* Static cap path (also reached from the ALLSENT case above). */
	if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) {
#ifdef TCP_REQUEST_TRK
		if (rack->rc_hybrid_mode &&
		    rack->rc_catch_up &&
		    (rack->r_ctl.rc_last_sft != NULL) &&
		    (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
		    (rack_hybrid_allow_set_maxseg == 1) &&
		    ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
			/* Lets set in a smaller mss possibly here to match our rate-cap */
			uint32_t orig_max;

			orig_max = rack->r_ctl.rc_pace_max_segs;
			rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
			rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp));
			rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
		}
#endif
		*capped = 1;
		*bw = rack->r_ctl.bw_rate_cap;
		rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
				   *bw, 0, 0,
				   HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__);
	}
}
2235
2236 static uint64_t
rack_get_gp_est(struct tcp_rack * rack)2237 rack_get_gp_est(struct tcp_rack *rack)
2238 {
2239 uint64_t bw, lt_bw, ret_bw;
2240
2241 if (rack->rc_gp_filled == 0) {
2242 /*
2243 * We have yet no b/w measurement,
2244 * if we have a user set initial bw
2245 * return it. If we don't have that and
2246 * we have an srtt, use the tcp IW (10) to
2247 * calculate a fictional b/w over the SRTT
2248 * which is more or less a guess. Note
2249 * we don't use our IW from rack on purpose
2250 * so if we have like IW=30, we are not
2251 * calculating a "huge" b/w.
2252 */
2253 uint64_t srtt;
2254
2255 if (rack->dis_lt_bw == 1)
2256 lt_bw = 0;
2257 else
2258 lt_bw = rack_get_lt_bw(rack);
2259 if (lt_bw) {
2260 /*
2261 * No goodput bw but a long-term b/w does exist
2262 * lets use that.
2263 */
2264 ret_bw = lt_bw;
2265 goto compensate;
2266 }
2267 if (rack->r_ctl.init_rate)
2268 return (rack->r_ctl.init_rate);
2269
2270 /* Ok lets come up with the IW guess, if we have a srtt */
2271 if (rack->rc_tp->t_srtt == 0) {
2272 /*
2273 * Go with old pacing method
2274 * i.e. burst mitigation only.
2275 */
2276 return (0);
2277 }
2278 /* Ok lets get the initial TCP win (not racks) */
2279 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
2280 srtt = (uint64_t)rack->rc_tp->t_srtt;
2281 bw *= (uint64_t)USECS_IN_SECOND;
2282 bw /= srtt;
2283 ret_bw = bw;
2284 goto compensate;
2285
2286 }
2287 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
2288 /* Averaging is done, we can return the value */
2289 bw = rack->r_ctl.gp_bw;
2290 } else {
2291 /* Still doing initial average must calculate */
2292 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
2293 }
2294 if (rack->dis_lt_bw) {
2295 /* We are not using lt-bw */
2296 ret_bw = bw;
2297 goto compensate;
2298 }
2299 lt_bw = rack_get_lt_bw(rack);
2300 if (lt_bw == 0) {
2301 /* If we don't have one then equate it to the gp_bw */
2302 lt_bw = rack->r_ctl.gp_bw;
2303 }
2304 if (rack->use_lesser_lt_bw) {
2305 if (lt_bw < bw)
2306 ret_bw = lt_bw;
2307 else
2308 ret_bw = bw;
2309 } else {
2310 if (lt_bw > bw)
2311 ret_bw = lt_bw;
2312 else
2313 ret_bw = bw;
2314 }
2315 /*
2316 * Now lets compensate based on the TCP/IP overhead. Our
2317 * Goodput estimate does not include this so we must pace out
2318 * a bit faster since our pacing calculations do. The pacing
2319 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz
2320 * we are using to do this, so we do that here in the opposite
2321 * direction as well. This means that if we are tunneled and the
2322 * segsiz is say 1200 bytes we will get quite a boost, but its
2323 * compensated for in the pacing time the opposite way.
2324 */
2325 compensate:
2326 ret_bw = rack_compensate_for_linerate(rack, ret_bw);
2327 return(ret_bw);
2328 }
2329
2330
2331 static uint64_t
rack_get_bw(struct tcp_rack * rack)2332 rack_get_bw(struct tcp_rack *rack)
2333 {
2334 uint64_t bw;
2335
2336 if (rack->use_fixed_rate) {
2337 /* Return the fixed pacing rate */
2338 return (rack_get_fixed_pacing_bw(rack));
2339 }
2340 bw = rack_get_gp_est(rack);
2341 return (bw);
2342 }
2343
2344 static uint16_t
rack_get_output_gain(struct tcp_rack * rack,struct rack_sendmap * rsm)2345 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
2346 {
2347 if (rack->use_fixed_rate) {
2348 return (100);
2349 } else if (rack->in_probe_rtt && (rsm == NULL))
2350 return (rack->r_ctl.rack_per_of_gp_probertt);
2351 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
2352 rack->r_ctl.rack_per_of_gp_rec)) {
2353 if (rsm) {
2354 /* a retransmission always use the recovery rate */
2355 return (rack->r_ctl.rack_per_of_gp_rec);
2356 } else if (rack->rack_rec_nonrxt_use_cr) {
2357 /* Directed to use the configured rate */
2358 goto configured_rate;
2359 } else if (rack->rack_no_prr &&
2360 (rack->r_ctl.rack_per_of_gp_rec > 100)) {
2361 /* No PRR, lets just use the b/w estimate only */
2362 return (100);
2363 } else {
2364 /*
2365 * Here we may have a non-retransmit but we
2366 * have no overrides, so just use the recovery
2367 * rate (prr is in effect).
2368 */
2369 return (rack->r_ctl.rack_per_of_gp_rec);
2370 }
2371 }
2372 configured_rate:
2373 /* For the configured rate we look at our cwnd vs the ssthresh */
2374 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2375 return (rack->r_ctl.rack_per_of_gp_ss);
2376 else
2377 return (rack->r_ctl.rack_per_of_gp_ca);
2378 }
2379
static void
rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
{
	/*
	 * BB-log a DSACK related event.  The meaning of flex4/5/6 depends
	 * on the mod value, per the table below.
	 *
	 * Types of logs (mod value)
	 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
	 * 2 = a dsack round begins, persist is reset to 16.
	 * 3 = a dsack round ends
	 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
	 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
	 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		/*
		 * Pack three control flags into the low bits of flex1
		 * (order matters): std_based | use_dsack | round_seen.
		 */
		log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
		log.u_bbr.flex1 <<= 1;
		log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
		log.u_bbr.flex1 <<= 1;
		log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
		log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
		log.u_bbr.flex3 = rack->r_ctl.num_dsack;
		log.u_bbr.flex4 = flex4;
		log.u_bbr.flex5 = flex5;
		log.u_bbr.flex6 = flex6;
		log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
		log.u_bbr.flex8 = mod;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    RACK_DSACK_HANDLING, 0,
		    0, &log, false, &tv);
	}
}
2419
static void
rack_log_hdwr_pacing(struct tcp_rack *rack,
	uint64_t rate, uint64_t hw_rate, int line,
	int error, uint16_t mod)
{
	/*
	 * BB-log hardware pacing activity: the rate we asked for, the
	 * hardware rate we got (or were capped to), the call site, and
	 * any error from the ratelimit KPI.  mod distinguishes the
	 * different events that share this record type.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		const struct ifnet *ifp;
		uint64_t ifp64;

		memset(&log, 0, sizeof(log));
		/* hw_rate is 64 bits; split it across flex1/flex2. */
		log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
		log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
		/* Prefer the interface off the rate entry, else the route. */
		if (rack->r_ctl.crte) {
			ifp = rack->r_ctl.crte->ptbl->rs_ifp;
		} else if (rack->rc_inp->inp_route.ro_nh &&
		    rack->rc_inp->inp_route.ro_nh->nh_ifp) {
			ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
		} else
			ifp = NULL;
		if (ifp) {
			/* Record the ifp pointer split across flex3/flex4. */
			ifp64 = (uintptr_t)ifp;
			log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff);
			log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff);
		}
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.bw_inuse = rate;
		log.u_bbr.flex5 = line;
		log.u_bbr.flex6 = error;
		log.u_bbr.flex7 = mod;
		log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
		/* Pack use_fixed_rate | rack_hdrw_pacing into flex8. */
		log.u_bbr.flex8 = rack->use_fixed_rate;
		log.u_bbr.flex8 <<= 1;
		log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
		log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
		log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
		if (rack->r_ctl.crte)
			log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
		else
			log.u_bbr.cur_del_rate = 0;
		log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_HDWR_PACE, 0,
		    0, &log, false, &tv);
	}
}
2471
2472 static uint64_t
rack_get_output_bw(struct tcp_rack * rack,uint64_t bw,struct rack_sendmap * rsm,int * capped)2473 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2474 {
2475 /*
2476 * We allow rack_per_of_gp_xx to dictate our bw rate we want.
2477 */
2478 uint64_t bw_est, high_rate;
2479 uint64_t gain;
2480
2481 gain = (uint64_t)rack_get_output_gain(rack, rsm);
2482 bw_est = bw * gain;
2483 bw_est /= (uint64_t)100;
2484 /* Never fall below the minimum (def 64kbps) */
2485 if (bw_est < RACK_MIN_BW)
2486 bw_est = RACK_MIN_BW;
2487 if (rack->r_rack_hw_rate_caps) {
2488 /* Rate caps are in place */
2489 if (rack->r_ctl.crte != NULL) {
2490 /* We have a hdwr rate already */
2491 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2492 if (bw_est >= high_rate) {
2493 /* We are capping bw at the highest rate table entry */
2494 if (rack_hw_rate_cap_per &&
2495 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) {
2496 rack->r_rack_hw_rate_caps = 0;
2497 goto done;
2498 }
2499 rack_log_hdwr_pacing(rack,
2500 bw_est, high_rate, __LINE__,
2501 0, 3);
2502 bw_est = high_rate;
2503 if (capped)
2504 *capped = 1;
2505 }
2506 } else if ((rack->rack_hdrw_pacing == 0) &&
2507 (rack->rack_hdw_pace_ena) &&
2508 (rack->rack_attempt_hdwr_pace == 0) &&
2509 (rack->rc_inp->inp_route.ro_nh != NULL) &&
2510 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2511 /*
2512 * Special case, we have not yet attempted hardware
2513 * pacing, and yet we may, when we do, find out if we are
2514 * above the highest rate. We need to know the maxbw for the interface
2515 * in question (if it supports ratelimiting). We get back
2516 * a 0, if the interface is not found in the RL lists.
2517 */
2518 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2519 if (high_rate) {
2520 /* Yep, we have a rate is it above this rate? */
2521 if (bw_est > high_rate) {
2522 bw_est = high_rate;
2523 if (capped)
2524 *capped = 1;
2525 }
2526 }
2527 }
2528 }
2529 done:
2530 return (bw_est);
2531 }
2532
static void
rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
{
	/*
	 * BB-log why a sendmap entry is being retransmitted, or a change
	 * to its dup-ack count.  tsused/thresh are the time values the
	 * retransmit decision compared.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		if ((mod != 1) && (rack_verbose_logging == 0)) {
			/*
			 * We get 3 values currently for mod
			 * 1 - We are retransmitting and this tells the reason.
			 * 2 - We are clearing a dup-ack count.
			 * 3 - We are incrementing a dup-ack count.
			 *
			 * The clear/increment are only logged
			 * if you have BBverbose on.
			 */
			return;
		}
		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = tsused;
		log.u_bbr.flex2 = thresh;
		log.u_bbr.flex3 = rsm->r_flags;
		log.u_bbr.flex4 = rsm->r_dupack;
		log.u_bbr.flex5 = rsm->r_start;
		log.u_bbr.flex6 = rsm->r_end;
		log.u_bbr.flex8 = mod;
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
		log.u_bbr.pacing_gain = rack->r_must_retran;
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_SETTINGS_CHG, 0,
		    0, &log, false, &tv);
	}
}
2575
2576 static void
rack_log_to_start(struct tcp_rack * rack,uint32_t cts,uint32_t to,int32_t pacing_delay,uint8_t which)2577 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
2578 {
2579 if (tcp_bblogging_on(rack->rc_tp)) {
2580 union tcp_log_stackspecific log;
2581 struct timeval tv;
2582
2583 memset(&log, 0, sizeof(log));
2584 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2585 log.u_bbr.flex2 = to;
2586 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2587 log.u_bbr.flex4 = pacing_delay;
2588 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
2589 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2590 log.u_bbr.flex7 = rack->rc_in_persist;
2591 log.u_bbr.flex8 = which;
2592 if (rack->rack_no_prr)
2593 log.u_bbr.pkts_out = 0;
2594 else
2595 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2596 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2597 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2598 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2599 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2600 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2601 log.u_bbr.pacing_gain = rack->r_must_retran;
2602 log.u_bbr.cwnd_gain = rack->rack_deferred_inited;
2603 log.u_bbr.pkt_epoch = rack->rc_has_collapsed;
2604 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2605 log.u_bbr.lost = rack_rto_min;
2606 log.u_bbr.epoch = rack->r_ctl.roundends;
2607 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2608 log.u_bbr.bw_inuse <<= 32;
2609 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2610 log.u_bbr.applimited = rack->rc_tp->t_flags2;
2611 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2612 &rack->rc_inp->inp_socket->so_rcv,
2613 &rack->rc_inp->inp_socket->so_snd,
2614 BBR_LOG_TIMERSTAR, 0,
2615 0, &log, false, &tv);
2616 }
2617 }
2618
2619 static void
rack_log_to_event(struct tcp_rack * rack,int32_t to_num,struct rack_sendmap * rsm)2620 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2621 {
2622 if (tcp_bblogging_on(rack->rc_tp)) {
2623 union tcp_log_stackspecific log;
2624 struct timeval tv;
2625
2626 memset(&log, 0, sizeof(log));
2627 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2628 log.u_bbr.flex8 = to_num;
2629 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2630 log.u_bbr.flex2 = rack->rc_rack_rtt;
2631 if (rsm == NULL)
2632 log.u_bbr.flex3 = 0;
2633 else
2634 log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2635 if (rack->rack_no_prr)
2636 log.u_bbr.flex5 = 0;
2637 else
2638 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2639 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2640 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2641 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2642 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2643 log.u_bbr.pacing_gain = rack->r_must_retran;
2644 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2645 log.u_bbr.bw_inuse <<= 32;
2646 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2647 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2648 &rack->rc_inp->inp_socket->so_rcv,
2649 &rack->rc_inp->inp_socket->so_snd,
2650 BBR_LOG_RTO, 0,
2651 0, &log, false, &tv);
2652 }
2653 }
2654
static void
rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *prev,
    struct rack_sendmap *rsm,
    struct rack_sendmap *next,
    int flag, uint32_t th_ack, int line)
{
	/*
	 * Verbose-only BB-log of a sendmap change (split/merge/free/etc,
	 * per "flag").  The prev/rsm/next pointers and their start/end
	 * ranges are recorded, with flex7 bits marking which of the
	 * three were present: 0x4=prev, 0x2=rsm, 0x1=next.
	 */
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex8 = flag;
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		log.u_bbr.cur_del_rate = (uintptr_t)prev;
		log.u_bbr.delRate = (uintptr_t)rsm;
		log.u_bbr.rttProp = (uintptr_t)next;
		/*
		 * NOTE(review): flex1 set here is overwritten with
		 * prev->r_start below whenever prev != NULL, so r_flags
		 * only survives when there is no prev — confirm intended.
		 */
		if (rsm)
			log.u_bbr.flex1 = rsm->r_flags;
		log.u_bbr.flex7 = 0;
		if (prev) {
			log.u_bbr.flex1 = prev->r_start;
			log.u_bbr.flex2 = prev->r_end;
			log.u_bbr.flex7 |= 0x4;
		}
		if (rsm) {
			log.u_bbr.flex3 = rsm->r_start;
			log.u_bbr.flex4 = rsm->r_end;
			log.u_bbr.flex7 |= 0x2;
		}
		if (next) {
			log.u_bbr.flex5 = next->r_start;
			log.u_bbr.flex6 = next->r_end;
			log.u_bbr.flex7 |= 0x1;
		}
		log.u_bbr.applimited = line;
		log.u_bbr.pkts_out = th_ack;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		if (rack->rack_no_prr)
			log.u_bbr.lost = 0;
		else
			log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.bw_inuse = rack->r_ctl.current_round;
		log.u_bbr.bw_inuse <<= 32;
		log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_LOG_MAPCHG, 0,
		    0, &log, false, &tv);
	}
}
2708
static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
    struct rack_sendmap *rsm, int conf)
{
	/*
	 * BB-log an RTT measurement update: the measured time "t" over
	 * "len" bytes, the sendmap entry it came from (NULL for a SYN),
	 * and the confidence level assigned to the sample.
	 */
	if (tcp_bblogging_on(tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		memset(&log, 0, sizeof(log));
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		log.u_bbr.flex1 = t;
		log.u_bbr.flex2 = len;
		log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
		log.u_bbr.flex7 = conf;
		log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
		log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		if (rsm) {
			/* Record the measured entry's range and rxt count. */
			log.u_bbr.pkt_epoch = rsm->r_start;
			log.u_bbr.lost = rsm->r_end;
			log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
			/* We loose any upper of the 24 bits */
			log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
		} else {
			/* Its a SYN */
			log.u_bbr.pkt_epoch = rack->rc_tp->iss;
			log.u_bbr.lost = 0;
			log.u_bbr.cwnd_gain = 0;
			log.u_bbr.pacing_gain = 0;
		}
		/* Write out general bits of interest rrs here */
		/* Eight state flags packed MSB-first; order matters. */
		log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->forced_ack;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
		log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
		log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
		log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
		log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
		/* High 32: arrival time; low 32: last send time of rsm. */
		log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
		log.u_bbr.bw_inuse <<= 32;
		if (rsm)
			log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
		TCP_LOG_EVENTP(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRRTT, 0,
		    0, &log, false, &tv);


	}
}
2778
static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
	/*
	 * Log the rtt sample we are
	 * applying to the srtt algorithm in
	 * useconds.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		/* Convert our ms to a microsecond */
		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = rtt;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex7 = 1;	/* Marks this as the srtt-input record. */
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
		log.u_bbr.pacing_gain = rack->r_must_retran;
		/*
		 * We capture in delRate the upper 32 bits as
		 * the confidence level we had declared, and the
		 * lower 32 bits as the actual RTT using the arrival
		 * timestamp.
		 */
		log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
		log.u_bbr.delRate <<= 32;
		log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
		/* Lets capture all the things that make up t_rtxcur */
		log.u_bbr.applimited = rack_rto_min;
		log.u_bbr.epoch = rack_rto_max;
		log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
		log.u_bbr.lost = rack_rto_min;
		log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
		log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
		/* Arrival time as a single microsecond count. */
		log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
		log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
		log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_LOG_RTT, 0,
		    0, &log, false, &tv);
	}
}
2827
2828 static void
rack_log_rtt_sample_calc(struct tcp_rack * rack,uint32_t rtt,uint32_t send_time,uint32_t ack_time,int where)2829 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
2830 {
2831 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2832 union tcp_log_stackspecific log;
2833 struct timeval tv;
2834
2835 /* Convert our ms to a microsecond */
2836 memset(&log, 0, sizeof(log));
2837 log.u_bbr.flex1 = rtt;
2838 log.u_bbr.flex2 = send_time;
2839 log.u_bbr.flex3 = ack_time;
2840 log.u_bbr.flex4 = where;
2841 log.u_bbr.flex7 = 2;
2842 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2843 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2844 log.u_bbr.bw_inuse <<= 32;
2845 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2846 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2847 &rack->rc_inp->inp_socket->so_rcv,
2848 &rack->rc_inp->inp_socket->so_snd,
2849 TCP_LOG_RTT, 0,
2850 0, &log, false, &tv);
2851 }
2852 }
2853
2854
2855 static void
rack_log_rtt_sendmap(struct tcp_rack * rack,uint32_t idx,uint64_t tsv,uint32_t tsecho)2856 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
2857 {
2858 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2859 union tcp_log_stackspecific log;
2860 struct timeval tv;
2861
2862 /* Convert our ms to a microsecond */
2863 memset(&log, 0, sizeof(log));
2864 log.u_bbr.flex1 = idx;
2865 log.u_bbr.flex2 = rack_ts_to_msec(tsv);
2866 log.u_bbr.flex3 = tsecho;
2867 log.u_bbr.flex7 = 3;
2868 log.u_bbr.rttProp = tsv;
2869 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2870 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2871 log.u_bbr.bw_inuse <<= 32;
2872 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2873 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2874 &rack->rc_inp->inp_socket->so_rcv,
2875 &rack->rc_inp->inp_socket->so_snd,
2876 TCP_LOG_RTT, 0,
2877 0, &log, false, &tv);
2878 }
2879 }
2880
2881
2882 static inline void
rack_log_progress_event(struct tcp_rack * rack,struct tcpcb * tp,uint32_t tick,int event,int line)2883 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
2884 {
2885 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2886 union tcp_log_stackspecific log;
2887 struct timeval tv;
2888
2889 memset(&log, 0, sizeof(log));
2890 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2891 log.u_bbr.flex1 = line;
2892 log.u_bbr.flex2 = tick;
2893 log.u_bbr.flex3 = tp->t_maxunacktime;
2894 log.u_bbr.flex4 = tp->t_acktime;
2895 log.u_bbr.flex8 = event;
2896 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2897 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2898 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2899 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2900 log.u_bbr.pacing_gain = rack->r_must_retran;
2901 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2902 log.u_bbr.bw_inuse <<= 32;
2903 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2904 TCP_LOG_EVENTP(tp, NULL,
2905 &rack->rc_inp->inp_socket->so_rcv,
2906 &rack->rc_inp->inp_socket->so_snd,
2907 BBR_LOG_PROGRESS, 0,
2908 0, &log, false, &tv);
2909 }
2910 }
2911
2912 static void
rack_log_type_bbrsnd(struct tcp_rack * rack,uint32_t len,uint32_t pacing_delay,uint32_t cts,struct timeval * tv,int line)2913 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line)
2914 {
2915 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2916 union tcp_log_stackspecific log;
2917
2918 memset(&log, 0, sizeof(log));
2919 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2920 log.u_bbr.flex1 = pacing_delay;
2921 if (rack->rack_no_prr)
2922 log.u_bbr.flex2 = 0;
2923 else
2924 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
2925 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
2926 log.u_bbr.flex6 = line;
2927 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
2928 log.u_bbr.flex8 = rack->rc_in_persist;
2929 log.u_bbr.timeStamp = cts;
2930 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2931 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2932 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2933 log.u_bbr.pacing_gain = rack->r_must_retran;
2934 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2935 &rack->rc_inp->inp_socket->so_rcv,
2936 &rack->rc_inp->inp_socket->so_snd,
2937 BBR_LOG_BBRSND, 0,
2938 0, &log, false, tv);
2939 }
2940 }
2941
static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
{
	/*
	 * BB-log the end of an input/output pass: whether we sent
	 * (did_out), how we exited (way_out), whether another packet is
	 * pending (nxt_pkt) and how many segments went out.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = did_out;
		log.u_bbr.flex2 = nxt_pkt;
		log.u_bbr.flex3 = way_out;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		if (rack->rack_no_prr)
			log.u_bbr.flex5 = 0;
		else
			log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex6 = nsegs;
		log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
		/* Pack three output-state flags into flex7; order matters. */
		log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;	/* Do we have ack-can-send set */
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->r_fast_output;	/* is fast output primed */
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->r_wanted_output;	/* Do we want output */
		log.u_bbr.flex8 = rack->rc_in_persist;
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		/* Pack recovery/revert flags into use_lt_bw; order matters. */
		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->r_might_revert;
		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
		log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
		log.u_bbr.pacing_gain = rack->r_must_retran;
		log.u_bbr.bw_inuse = rack->r_ctl.current_round;
		log.u_bbr.bw_inuse <<= 32;
		log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
		log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat;
		log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat;
		log.u_bbr.lost = rack->rc_tp->t_srtt;
		log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_DOSEG_DONE, 0,
		    0, &log, false, &tv);
	}
}
2989
2990 static void
rack_log_type_pacing_sizes(struct tcpcb * tp,struct tcp_rack * rack,uint32_t arg1,uint32_t arg2,uint32_t arg3,uint8_t frm)2991 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
2992 {
2993 if (tcp_bblogging_on(rack->rc_tp)) {
2994 union tcp_log_stackspecific log;
2995 struct timeval tv;
2996
2997 memset(&log, 0, sizeof(log));
2998 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
2999 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
3000 log.u_bbr.flex4 = arg1;
3001 log.u_bbr.flex5 = arg2;
3002 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs;
3003 log.u_bbr.flex6 = arg3;
3004 log.u_bbr.flex8 = frm;
3005 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3006 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3007 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3008 log.u_bbr.applimited = rack->r_ctl.rc_sacked;
3009 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3010 log.u_bbr.pacing_gain = rack->r_must_retran;
3011 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
3012 &tptosocket(tp)->so_snd,
3013 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv);
3014 }
3015 }
3016
3017 static void
rack_log_type_just_return(struct tcp_rack * rack,uint32_t cts,uint32_t tlen,uint32_t pacing_delay,uint8_t hpts_calling,int reason,uint32_t cwnd_to_use)3018 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay,
3019 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
3020 {
3021 if (tcp_bblogging_on(rack->rc_tp)) {
3022 union tcp_log_stackspecific log;
3023 struct timeval tv;
3024
3025 memset(&log, 0, sizeof(log));
3026 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
3027 log.u_bbr.flex1 = pacing_delay;
3028 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
3029 log.u_bbr.flex4 = reason;
3030 if (rack->rack_no_prr)
3031 log.u_bbr.flex5 = 0;
3032 else
3033 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
3034 log.u_bbr.flex7 = hpts_calling;
3035 log.u_bbr.flex8 = rack->rc_in_persist;
3036 log.u_bbr.lt_epoch = cwnd_to_use;
3037 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3038 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3039 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3040 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3041 log.u_bbr.pacing_gain = rack->r_must_retran;
3042 log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
3043 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
3044 log.u_bbr.bw_inuse <<= 32;
3045 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
3046 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3047 &rack->rc_inp->inp_socket->so_rcv,
3048 &rack->rc_inp->inp_socket->so_snd,
3049 BBR_LOG_JUSTRET, 0,
3050 tlen, &log, false, &tv);
3051 }
3052 }
3053
3054 static void
rack_log_to_cancel(struct tcp_rack * rack,int32_t hpts_removed,int line,uint32_t us_cts,struct timeval * tv,uint32_t flags_on_entry)3055 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
3056 struct timeval *tv, uint32_t flags_on_entry)
3057 {
3058 if (tcp_bblogging_on(rack->rc_tp)) {
3059 union tcp_log_stackspecific log;
3060
3061 memset(&log, 0, sizeof(log));
3062 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
3063 log.u_bbr.flex1 = line;
3064 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
3065 log.u_bbr.flex3 = flags_on_entry;
3066 log.u_bbr.flex4 = us_cts;
3067 if (rack->rack_no_prr)
3068 log.u_bbr.flex5 = 0;
3069 else
3070 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
3071 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
3072 log.u_bbr.flex7 = hpts_removed;
3073 log.u_bbr.flex8 = 1;
3074 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
3075 log.u_bbr.timeStamp = us_cts;
3076 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3077 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3078 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3079 log.u_bbr.pacing_gain = rack->r_must_retran;
3080 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
3081 log.u_bbr.bw_inuse <<= 32;
3082 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
3083 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3084 &rack->rc_inp->inp_socket->so_rcv,
3085 &rack->rc_inp->inp_socket->so_snd,
3086 BBR_LOG_TIMERCANC, 0,
3087 0, &log, false, tv);
3088 }
3089 }
3090
3091 static void
rack_log_alt_to_to_cancel(struct tcp_rack * rack,uint32_t flex1,uint32_t flex2,uint32_t flex3,uint32_t flex4,uint32_t flex5,uint32_t flex6,uint16_t flex7,uint8_t mod)3092 rack_log_alt_to_to_cancel(struct tcp_rack *rack,
3093 uint32_t flex1, uint32_t flex2,
3094 uint32_t flex3, uint32_t flex4,
3095 uint32_t flex5, uint32_t flex6,
3096 uint16_t flex7, uint8_t mod)
3097 {
3098 if (tcp_bblogging_on(rack->rc_tp)) {
3099 union tcp_log_stackspecific log;
3100 struct timeval tv;
3101
3102 if (mod == 1) {
3103 /* No you can't use 1, its for the real to cancel */
3104 return;
3105 }
3106 memset(&log, 0, sizeof(log));
3107 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3108 log.u_bbr.flex1 = flex1;
3109 log.u_bbr.flex2 = flex2;
3110 log.u_bbr.flex3 = flex3;
3111 log.u_bbr.flex4 = flex4;
3112 log.u_bbr.flex5 = flex5;
3113 log.u_bbr.flex6 = flex6;
3114 log.u_bbr.flex7 = flex7;
3115 log.u_bbr.flex8 = mod;
3116 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3117 &rack->rc_inp->inp_socket->so_rcv,
3118 &rack->rc_inp->inp_socket->so_snd,
3119 BBR_LOG_TIMERCANC, 0,
3120 0, &log, false, &tv);
3121 }
3122 }
3123
3124 static void
rack_log_to_processing(struct tcp_rack * rack,uint32_t cts,int32_t ret,int32_t timers)3125 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
3126 {
3127 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
3128 union tcp_log_stackspecific log;
3129 struct timeval tv;
3130
3131 memset(&log, 0, sizeof(log));
3132 log.u_bbr.flex1 = timers;
3133 log.u_bbr.flex2 = ret;
3134 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
3135 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
3136 log.u_bbr.flex5 = cts;
3137 if (rack->rack_no_prr)
3138 log.u_bbr.flex6 = 0;
3139 else
3140 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
3141 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
3142 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
3143 log.u_bbr.pacing_gain = rack->r_must_retran;
3144 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3145 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3146 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3147 &rack->rc_inp->inp_socket->so_rcv,
3148 &rack->rc_inp->inp_socket->so_snd,
3149 BBR_LOG_TO_PROCESS, 0,
3150 0, &log, false, &tv);
3151 }
3152 }
3153
3154 static void
rack_log_to_prr(struct tcp_rack * rack,int frm,int orig_cwnd,int line)3155 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
3156 {
3157 if (tcp_bblogging_on(rack->rc_tp)) {
3158 union tcp_log_stackspecific log;
3159 struct timeval tv;
3160
3161 memset(&log, 0, sizeof(log));
3162 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
3163 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
3164 if (rack->rack_no_prr)
3165 log.u_bbr.flex3 = 0;
3166 else
3167 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
3168 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
3169 log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
3170 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
3171 log.u_bbr.flex7 = line;
3172 log.u_bbr.flex8 = frm;
3173 log.u_bbr.pkts_out = orig_cwnd;
3174 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
3175 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
3176 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
3177 log.u_bbr.use_lt_bw <<= 1;
3178 log.u_bbr.use_lt_bw |= rack->r_might_revert;
3179 TCP_LOG_EVENTP(rack->rc_tp, NULL,
3180 &rack->rc_inp->inp_socket->so_rcv,
3181 &rack->rc_inp->inp_socket->so_snd,
3182 BBR_LOG_BBRUPD, 0,
3183 0, &log, false, &tv);
3184 }
3185 }
3186
static void
rack_counter_destroy(void)
{
	/*
	 * Release every global rack statistics counter and counter
	 * array; the inverse of the allocation done at module load.
	 * Safe only once no connection can still bump these counters.
	 */
	counter_u64_free(rack_total_bytes);
	/* Fast-transmit-output (FTO) path counters. */
	counter_u64_free(rack_fto_send);
	counter_u64_free(rack_fto_rsm_send);
	counter_u64_free(rack_nfto_resend);
	counter_u64_free(rack_hw_pace_init_fail);
	counter_u64_free(rack_hw_pace_lost);
	counter_u64_free(rack_non_fto_send);
	counter_u64_free(rack_extended_rfo);
	/* TLP (tail loss probe) and timeout counters. */
	counter_u64_free(rack_tlp_tot);
	counter_u64_free(rack_tlp_newdata);
	counter_u64_free(rack_tlp_retran);
	counter_u64_free(rack_tlp_retran_bytes);
	counter_u64_free(rack_to_tot);
	counter_u64_free(rack_saw_enobuf);
	counter_u64_free(rack_saw_enobuf_hw);
	counter_u64_free(rack_saw_enetunreach);
	/* Sendmap allocation-path counters. */
	counter_u64_free(rack_hot_alloc);
	counter_u64_free(rack_to_alloc);
	counter_u64_free(rack_to_alloc_hard);
	counter_u64_free(rack_to_alloc_emerg);
	counter_u64_free(rack_to_alloc_limited);
	counter_u64_free(rack_alloc_limited_conns);
	counter_u64_free(rack_split_limited);
	counter_u64_free(rack_multi_single_eq);
	counter_u64_free(rack_rxt_clamps_cwnd);
	counter_u64_free(rack_rxt_clamps_cwnd_uniq);
	/* SACK-processing and window counters. */
	counter_u64_free(rack_proc_non_comp_ack);
	counter_u64_free(rack_sack_proc_all);
	counter_u64_free(rack_sack_proc_restart);
	counter_u64_free(rack_sack_proc_short);
	counter_u64_free(rack_input_idle_reduces);
	counter_u64_free(rack_collapsed_win);
	counter_u64_free(rack_collapsed_win_rxt);
	counter_u64_free(rack_collapsed_win_rxt_bytes);
	counter_u64_free(rack_collapsed_win_seen);
	counter_u64_free(rack_try_scwnd);
	/* Persist-timer counters. */
	counter_u64_free(rack_persists_sends);
	counter_u64_free(rack_persists_acks);
	counter_u64_free(rack_persists_loss);
	counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
	counter_u64_free(rack_adjust_map_bw);
#endif
	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}
3236
3237 static struct rack_sendmap *
rack_alloc(struct tcp_rack * rack)3238 rack_alloc(struct tcp_rack *rack)
3239 {
3240 struct rack_sendmap *rsm;
3241
3242 /*
3243 * First get the top of the list it in
3244 * theory is the "hottest" rsm we have,
3245 * possibly just freed by ack processing.
3246 */
3247 if (rack->rc_free_cnt > rack_free_cache) {
3248 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3249 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3250 counter_u64_add(rack_hot_alloc, 1);
3251 rack->rc_free_cnt--;
3252 return (rsm);
3253 }
3254 /*
3255 * Once we get under our free cache we probably
3256 * no longer have a "hot" one available. Lets
3257 * get one from UMA.
3258 */
3259 rsm = uma_zalloc(rack_zone, M_NOWAIT);
3260 if (rsm) {
3261 rack->r_ctl.rc_num_maps_alloced++;
3262 counter_u64_add(rack_to_alloc, 1);
3263 return (rsm);
3264 }
3265 /*
3266 * Dig in to our aux rsm's (the last two) since
3267 * UMA failed to get us one.
3268 */
3269 if (rack->rc_free_cnt) {
3270 counter_u64_add(rack_to_alloc_emerg, 1);
3271 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
3272 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3273 rack->rc_free_cnt--;
3274 return (rsm);
3275 }
3276 return (NULL);
3277 }
3278
3279 static struct rack_sendmap *
rack_alloc_full_limit(struct tcp_rack * rack)3280 rack_alloc_full_limit(struct tcp_rack *rack)
3281 {
3282 if ((V_tcp_map_entries_limit > 0) &&
3283 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
3284 counter_u64_add(rack_to_alloc_limited, 1);
3285 if (!rack->alloc_limit_reported) {
3286 rack->alloc_limit_reported = 1;
3287 counter_u64_add(rack_alloc_limited_conns, 1);
3288 }
3289 return (NULL);
3290 }
3291 return (rack_alloc(rack));
3292 }
3293
3294 /* wrapper to allocate a sendmap entry, subject to a specific limit */
3295 static struct rack_sendmap *
rack_alloc_limit(struct tcp_rack * rack,uint8_t limit_type)3296 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
3297 {
3298 struct rack_sendmap *rsm;
3299
3300 if (limit_type) {
3301 /* currently there is only one limit type */
3302 if (rack->r_ctl.rc_split_limit > 0 &&
3303 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) {
3304 counter_u64_add(rack_split_limited, 1);
3305 if (!rack->alloc_limit_reported) {
3306 rack->alloc_limit_reported = 1;
3307 counter_u64_add(rack_alloc_limited_conns, 1);
3308 }
3309 return (NULL);
3310 }
3311 }
3312
3313 /* allocate and mark in the limit type, if set */
3314 rsm = rack_alloc(rack);
3315 if (rsm != NULL && limit_type) {
3316 rsm->r_limit_type = limit_type;
3317 rack->r_ctl.rc_num_split_allocs++;
3318 }
3319 return (rsm);
3320 }
3321
3322 static void
rack_free_trim(struct tcp_rack * rack)3323 rack_free_trim(struct tcp_rack *rack)
3324 {
3325 struct rack_sendmap *rsm;
3326
3327 /*
3328 * Free up all the tail entries until
3329 * we get our list down to the limit.
3330 */
3331 while (rack->rc_free_cnt > rack_free_cache) {
3332 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
3333 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
3334 rack->rc_free_cnt--;
3335 rack->r_ctl.rc_num_maps_alloced--;
3336 uma_zfree(rack_zone, rsm);
3337 }
3338 }
3339
static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
	/*
	 * Return rsm to the connection's free list, first dropping all
	 * bookkeeping state that still references it. If the free list
	 * would exceed RACK_FREE_CNT_MAX, trim it back to the cache
	 * size (rack_free_trim) before inserting.
	 */
	if (rsm->r_flags & RACK_APP_LIMITED) {
		KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
		    ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
		rack->r_ctl.rc_app_limited_cnt--;
	}
	if (rsm->r_limit_type) {
		/* currently there is only one limit type */
		rack->r_ctl.rc_num_split_allocs--;
	}
	/*
	 * If this entry heads the app-limited chain, remember where it
	 * ended and advance the chain head to the next app-limited
	 * entry (or clear it when none remain).
	 */
	if (rsm == rack->r_ctl.rc_first_appl) {
		rack->r_ctl.cleared_app_ack_seq = rsm->r_end;
		rack->r_ctl.cleared_app_ack = 1;
		if (rack->r_ctl.rc_app_limited_cnt == 0)
			rack->r_ctl.rc_first_appl = NULL;
		else
			rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl);
	}
	/* Clear any shortcut pointers that would otherwise dangle. */
	if (rsm == rack->r_ctl.rc_resend)
		rack->r_ctl.rc_resend = NULL;
	if (rsm == rack->r_ctl.rc_end_appl)
		rack->r_ctl.rc_end_appl = NULL;
	if (rack->r_ctl.rc_tlpsend == rsm)
		rack->r_ctl.rc_tlpsend = NULL;
	if (rack->r_ctl.rc_sacklast == rsm)
		rack->r_ctl.rc_sacklast = NULL;
	memset(rsm, 0, sizeof(struct rack_sendmap));
	/* Make sure we are not going to overrun our count limit of 0xff */
	if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) {
		rack_free_trim(rack);
	}
	TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
	rack->rc_free_cnt++;
}
3376
3377 static uint32_t
rack_get_measure_window(struct tcpcb * tp,struct tcp_rack * rack)3378 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
3379 {
3380 uint64_t srtt, bw, len, tim;
3381 uint32_t segsiz, def_len, minl;
3382
3383 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3384 def_len = rack_def_data_window * segsiz;
3385 if (rack->rc_gp_filled == 0) {
3386 /*
3387 * We have no measurement (IW is in flight?) so
3388 * we can only guess using our data_window sysctl
3389 * value (usually 20MSS).
3390 */
3391 return (def_len);
3392 }
3393 /*
3394 * Now we have a number of factors to consider.
3395 *
3396 * 1) We have a desired BDP which is usually
3397 * at least 2.
3398 * 2) We have a minimum number of rtt's usually 1 SRTT
3399 * but we allow it too to be more.
3400 * 3) We want to make sure a measurement last N useconds (if
3401 * we have set rack_min_measure_usec.
3402 *
3403 * We handle the first concern here by trying to create a data
3404 * window of max(rack_def_data_window, DesiredBDP). The
3405 * second concern we handle in not letting the measurement
3406 * window end normally until at least the required SRTT's
3407 * have gone by which is done further below in
3408 * rack_enough_for_measurement(). Finally the third concern
3409 * we also handle here by calculating how long that time
3410 * would take at the current BW and then return the
3411 * max of our first calculation and that length. Note
3412 * that if rack_min_measure_usec is 0, we don't deal
3413 * with concern 3. Also for both Concern 1 and 3 an
3414 * application limited period could end the measurement
3415 * earlier.
3416 *
3417 * So lets calculate the BDP with the "known" b/w using
3418 * the SRTT as our rtt and then multiply it by the goal.
3419 */
3420 bw = rack_get_bw(rack);
3421 srtt = (uint64_t)tp->t_srtt;
3422 len = bw * srtt;
3423 len /= (uint64_t)HPTS_USEC_IN_SEC;
3424 len *= max(1, rack_goal_bdp);
3425 /* Now we need to round up to the nearest MSS */
3426 len = roundup(len, segsiz);
3427 if (rack_min_measure_usec) {
3428 /* Now calculate our min length for this b/w */
3429 tim = rack_min_measure_usec;
3430 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
3431 if (minl == 0)
3432 minl = 1;
3433 minl = roundup(minl, segsiz);
3434 if (len < minl)
3435 len = minl;
3436 }
3437 /*
3438 * Now if we have a very small window we want
3439 * to attempt to get the window that is
3440 * as small as possible. This happens on
3441 * low b/w connections and we don't want to
3442 * span huge numbers of rtt's between measurements.
3443 *
3444 * We basically include 2 over our "MIN window" so
3445 * that the measurement can be shortened (possibly) by
3446 * an ack'ed packet.
3447 */
3448 if (len < def_len)
3449 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
3450 else
3451 return (max((uint32_t)len, def_len));
3452
3453 }
3454
static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
{
	uint32_t tim, srtts, segsiz;

	/*
	 * Has enough time passed for the GP measurement to be valid?
	 * Returns 1 when the goodput measurement may be completed at
	 * th_ack (writing the result's quality into *quality), 0 when
	 * more bytes or time are still required. *quality is only
	 * written on a non-zero return.
	 */
	if (SEQ_LT(th_ack, tp->gput_seq)) {
		/* Not enough bytes yet */
		return (0);
	}
	if ((tp->snd_max == tp->snd_una) ||
	    (th_ack == tp->snd_max)){
		/*
		 * All is acked quality of all acked is
		 * usually low or medium, but we in theory could split
		 * all acked into two cases, where you got
		 * a signifigant amount of your window and
		 * where you did not. For now we leave it
		 * but it is something to contemplate in the
		 * future. The danger here is that delayed ack
		 * is effecting the last byte (which is a 50:50 chance).
		 */
		*quality = RACK_QUALITY_ALLACKED;
		return (1);
	}
	if (SEQ_GEQ(th_ack, tp->gput_ack)) {
		/*
		 * We obtained our entire window of data we wanted
		 * no matter if we are in recovery or not then
		 * its ok since expanding the window does not
		 * make things fuzzy (or at least not as much).
		 */
		*quality = RACK_QUALITY_HIGH;
		return (1);
	}
	/* Partial window acked: require a minimum byte count first. */
	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	if (SEQ_LT(th_ack, tp->gput_ack) &&
	    ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
		/* Not enough bytes yet */
		return (0);
	}
	if (rack->r_ctl.rc_first_appl &&
	    (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
		/*
		 * We are up to the app limited send point
		 * we have to measure irrespective of the time..
		 */
		*quality = RACK_QUALITY_APPLIMITED;
		return (1);
	}
	/* Now what about time? Need rack_min_srtts worth of gp SRTT. */
	srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
	tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
	if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
		/*
		 * We do not allow a measurement if we are in recovery
		 * that would shrink the goodput window we wanted.
		 * This is to prevent cloudyness of when the last send
		 * was actually made.
		 */
		*quality = RACK_QUALITY_HIGH;
		return (1);
	}
	/* Nope not even a full SRTT has passed */
	return (0);
}
3523
static void
rack_log_timely(struct tcp_rack *rack,
    uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
    uint64_t up_bnd, int line, uint8_t method)
{
	/*
	 * BB-log a timely (goodput multiplier) decision. "method"
	 * identifies the decision point; "logged" is a caller-defined
	 * bitmask of which multipliers were touched; cur_bw/low_bnd/
	 * up_bnd carry the rates (or caller-packed values) involved.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = logged;
		/* flex2: four 4-bit fields — inc_cnt, dec_cnt, incr, bwred. */
		log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_incr;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_bwred;
		log.u_bbr.flex3 = rack->rc_gp_incr;
		/* Current per-state goodput multipliers. */
		log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
		log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
		log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
		log.u_bbr.flex7 = rack->rc_gp_bwred;
		log.u_bbr.flex8 = method;
		log.u_bbr.cur_del_rate = cur_bw;
		log.u_bbr.delRate = low_bnd;
		log.u_bbr.bw_inuse = up_bnd;
		log.u_bbr.rttProp = rack_get_bw(rack);
		log.u_bbr.pkt_epoch = line;
		log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
		/* cwnd_gain: packed flags — dragged_bottom, saw_rec, saw_ss, saw_ca. */
		log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
		log.u_bbr.lost = rack->r_ctl.rc_loss_count;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_TIMELY_WORK, 0,
		    0, &log, false, &tv);
	}
}
3573
3574 static int
rack_bw_can_be_raised(struct tcp_rack * rack,uint64_t cur_bw,uint64_t last_bw_est,uint16_t mult)3575 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
3576 {
3577 /*
3578 * Before we increase we need to know if
3579 * the estimate just made was less than
3580 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
3581 *
3582 * If we already are pacing at a fast enough
3583 * rate to push us faster there is no sense of
3584 * increasing.
3585 *
3586 * We first caculate our actual pacing rate (ss or ca multiplier
3587 * times our cur_bw).
3588 *
3589 * Then we take the last measured rate and multipy by our
3590 * maximum pacing overage to give us a max allowable rate.
3591 *
3592 * If our act_rate is smaller than our max_allowable rate
3593 * then we should increase. Else we should hold steady.
3594 *
3595 */
3596 uint64_t act_rate, max_allow_rate;
3597
3598 if (rack_timely_no_stopping)
3599 return (1);
3600
3601 if ((cur_bw == 0) || (last_bw_est == 0)) {
3602 /*
3603 * Initial startup case or
3604 * everything is acked case.
3605 */
3606 rack_log_timely(rack, mult, cur_bw, 0, 0,
3607 __LINE__, 9);
3608 return (1);
3609 }
3610 if (mult <= 100) {
3611 /*
3612 * We can always pace at or slightly above our rate.
3613 */
3614 rack_log_timely(rack, mult, cur_bw, 0, 0,
3615 __LINE__, 9);
3616 return (1);
3617 }
3618 act_rate = cur_bw * (uint64_t)mult;
3619 act_rate /= 100;
3620 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
3621 max_allow_rate /= 100;
3622 if (act_rate < max_allow_rate) {
3623 /*
3624 * Here the rate we are actually pacing at
3625 * is smaller than 10% above our last measurement.
3626 * This means we are pacing below what we would
3627 * like to try to achieve (plus some wiggle room).
3628 */
3629 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
3630 __LINE__, 9);
3631 return (1);
3632 } else {
3633 /*
3634 * Here we are already pacing at least rack_max_per_above(10%)
3635 * what we are getting back. This indicates most likely
3636 * that we are being limited (cwnd/rwnd/app) and can't
3637 * get any more b/w. There is no sense of trying to
3638 * raise up the pacing rate its not speeding us up
3639 * and we already are pacing faster than we are getting.
3640 */
3641 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
3642 __LINE__, 8);
3643 return (0);
3644 }
3645 }
3646
3647 static void
rack_validate_multipliers_at_or_above100(struct tcp_rack * rack)3648 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
3649 {
3650 /*
3651 * When we drag bottom, we want to assure
3652 * that no multiplier is below 1.0, if so
3653 * we want to restore it to at least that.
3654 */
3655 if (rack->r_ctl.rack_per_of_gp_rec < 100) {
3656 /* This is unlikely we usually do not touch recovery */
3657 rack->r_ctl.rack_per_of_gp_rec = 100;
3658 }
3659 if (rack->r_ctl.rack_per_of_gp_ca < 100) {
3660 rack->r_ctl.rack_per_of_gp_ca = 100;
3661 }
3662 if (rack->r_ctl.rack_per_of_gp_ss < 100) {
3663 rack->r_ctl.rack_per_of_gp_ss = 100;
3664 }
3665 }
3666
3667 static void
rack_validate_multipliers_at_or_below_100(struct tcp_rack * rack)3668 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
3669 {
3670 if (rack->r_ctl.rack_per_of_gp_ca > 100) {
3671 rack->r_ctl.rack_per_of_gp_ca = 100;
3672 }
3673 if (rack->r_ctl.rack_per_of_gp_ss > 100) {
3674 rack->r_ctl.rack_per_of_gp_ss = 100;
3675 }
3676 }
3677
static void
rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
{
	/*
	 * Raise the goodput pacing multipliers (rec/ca/ss) by
	 * rack_gp_increase_per percent, boosted by
	 * RACK_TIMELY_CNT_BOOST-fold after that many consecutive
	 * increases or when "override" is set. Each raise is gated by
	 * rack_bw_can_be_raised() using cur_bw/last_bw_est, and is
	 * clamped to the per-state upper bound unless bottom was
	 * dragged. No-op when timely processing is being skipped.
	 */
	int32_t calc, logged, plus;

	logged = 0;

	if (rack->rc_skip_timely)
		return;
	if (override) {
		/*
		 * override is passed when we are
		 * loosing b/w and making one last
		 * gasp at trying to not loose out
		 * to a new-reno flow.
		 */
		goto extra_boost;
	}
	/* In classic timely we boost by 5x if we have 5 increases in a row, lets not */
	if (rack->rc_gp_incr &&
	    ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
		/*
		 * Reset and get 5 strokes more before the boost. Note
		 * that the count is 0 based so we have to add one.
		 */
extra_boost:
		plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
		rack->rc_gp_timely_inc_cnt = 0;
	} else
		plus = (uint32_t)rack_gp_increase_per;
	/* Must be at least 1% increase for true timely increases */
	if ((plus < 1) &&
	    ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
		plus = 1;
	if (rack->rc_gp_saw_rec &&
	    (rack->rc_gp_no_rec_chg == 0) &&
	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
				  rack->r_ctl.rack_per_of_gp_rec)) {
		/* We have been in recovery ding it too */
		calc = rack->r_ctl.rack_per_of_gp_rec + plus;
		if (calc > 0xffff)	/* cap at the uint16_t maximum */
			calc = 0xffff;
		logged |= 1;
		rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
		if (rack->r_ctl.rack_per_upper_bound_ca &&
		    (rack->rc_dragged_bottom == 0) &&
		    (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca))
			rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca;
	}
	if (rack->rc_gp_saw_ca &&
	    (rack->rc_gp_saw_ss == 0) &&
	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
				  rack->r_ctl.rack_per_of_gp_ca)) {
		/* In CA */
		calc = rack->r_ctl.rack_per_of_gp_ca + plus;
		if (calc > 0xffff)
			calc = 0xffff;
		logged |= 2;
		rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
		if (rack->r_ctl.rack_per_upper_bound_ca &&
		    (rack->rc_dragged_bottom == 0) &&
		    (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca))
			rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca;
	}
	if (rack->rc_gp_saw_ss &&
	    rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
				  rack->r_ctl.rack_per_of_gp_ss)) {
		/* In SS */
		calc = rack->r_ctl.rack_per_of_gp_ss + plus;
		if (calc > 0xffff)
			calc = 0xffff;
		rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
		if (rack->r_ctl.rack_per_upper_bound_ss &&
		    (rack->rc_dragged_bottom == 0) &&
		    (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss))
			rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss;
		logged |= 4;
	}
	if (logged &&
	    (rack->rc_gp_incr == 0)){
		/* Go into increment mode */
		rack->rc_gp_incr = 1;
		rack->rc_gp_timely_inc_cnt = 0;
	}
	if (rack->rc_gp_incr &&
	    logged &&
	    (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
		rack->rc_gp_timely_inc_cnt++;
	}
	rack_log_timely(rack,  logged, plus, 0, 0,
			__LINE__, 1);
}
3770
static uint32_t
rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
{
	/*-
	 * Compute the reduced multiplier for a timely decrease:
	 *
	 * norm_grad = rtt_diff / minrtt;
	 * new_per = curper * (1 - B * norm_grad)
	 *
	 * B = rack_gp_decrease_per (default 80%)
	 * rtt_dif = input var current rtt-diff
	 * curper = input var current percentage
	 * minrtt = from rack filter
	 *
	 * NOTE(review): rtt_diff is assumed non-negative here — the
	 * visible caller (rack_decrease_bw_mul) negates it before
	 * calling; confirm for any other caller.
	 *
	 * In order to do the floating point calculations above we
	 * do an integer conversion. The code looks confusing so let me
	 * translate it into something that use more variables and
	 * is clearer for us humans :)
	 *
	 * uint64_t norm_grad, inverse, reduce_by, final_result;
	 * uint32_t perf;
	 *
	 * norm_grad = (((uint64_t)rtt_diff * 1000000) /
	 *		(uint64_t)get_filter_small(&rack->r_ctl.rc_gp_min_rtt));
	 * inverse = ((uint64_t)rack_gp_decrease * (uint64_t)1000000) * norm_grad;
	 * inverse /= 1000000;
	 * reduce_by = (1000000 - inverse);
	 * final_result = (cur_per * reduce_by) / 1000000;
	 * perf = (uint32_t)final_result;
	 */
	uint64_t perf;

	perf = (((uint64_t)curper * ((uint64_t)1000000 -
		    ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
		     (((uint64_t)rtt_diff * (uint64_t)1000000)/
		      (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
		    (uint64_t)1000000)) /
		(uint64_t)1000000);
	if (perf > curper) {
		/* TSNH — a "decrease" must not grow the multiplier;
		 * guard against fixed-point wrap and force a drop. */
		perf = curper - 1;
	}
	return ((uint32_t)perf);
}
3813
static uint32_t
rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
{
	/*
	 * Compute the reduced multiplier when the RTT is well above
	 * the minimum (timely_says == 2 case):
	 *
	 *                                 highrttthresh
	 * result = curper * (1 - (B * ( 1 -  ------          ))
	 *                                     gp_srtt
	 *
	 * B = rack_gp_decrease_per (default .8 i.e. 80)
	 * highrttthresh = filter_min * rack_gp_rtt_maxmul
	 *
	 * NOTE(review): divides by rtt — assumes the caller passes a
	 * nonzero measured RTT; confirm at call sites.
	 */
	uint64_t perf;
	uint32_t highrttthresh;

	highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;

	perf = (((uint64_t)curper * ((uint64_t)1000000 -
		    ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
			((uint64_t)highrttthresh * (uint64_t)1000000) /
			(uint64_t)rtt)) / 100)) /(uint64_t)1000000);
	if (tcp_bblogging_on(rack->rc_tp)) {
		uint64_t log1;

		/* Pack rtt (high 32) and the threshold (low 32) for the log. */
		log1 = rtt;
		log1 <<= 32;
		log1 |= highrttthresh;
		rack_log_timely(rack,
				rack_gp_decrease_per,
				(uint64_t)curper,
				log1,
				perf,
				__LINE__,
				15);
	}
	return (perf);
}
3850
static void
rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
{
	/*
	 * Lower the goodput pacing multipliers in response to a timely
	 * "decrease" signal. timely_says == 2 means the RTT is above
	 * the high threshold, in which case the smaller of the
	 * high-rtt reduction and the gradient reduction is applied;
	 * otherwise only the gradient reduction
	 * (rack_get_decrease) is used. Each multiplier is floored at
	 * rack_per_lower_bound. No-op when timely is skipped.
	 */
	uint64_t logvar, logvar2, logvar3;
	uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;

	if (rack->rc_skip_timely)
		return;
	if (rack->rc_gp_incr) {
		/* Turn off increment counting */
		rack->rc_gp_incr = 0;
		rack->rc_gp_timely_inc_cnt = 0;
	}
	ss_red = ca_red = rec_red = 0;
	logged = 0;
	/* Calculate the reduction value */
	if (rtt_diff < 0) {
		/* rack_get_decrease expects a non-negative diff. */
		rtt_diff *= -1;
	}
	/* Must be at least 1% reduction */
	if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
		/* We have been in recovery ding it too */
		if (timely_says == 2) {
			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
			if (alt < new_per)
				val = alt;
			else
				val = new_per;
		} else
			 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
		if (rack->r_ctl.rack_per_of_gp_rec > val) {
			rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
			rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
		} else {
			rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
			rec_red = 0;
		}
		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
			rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
		logged |= 1;
	}
	if (rack->rc_gp_saw_ss) {
		/* Sent in SS */
		if (timely_says == 2) {
			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
			if (alt < new_per)
				val = alt;
			else
				val = new_per;
		} else
			val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
		/*
		 * NOTE(review): this branch tests against new_per but
		 * subtracts val, while the CA branch below tests
		 * against val; when timely_says == 2 and alt < new_per
		 * the two differ — confirm this asymmetry is intended.
		 */
		if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
			ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
			rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
		} else {
			ss_red = new_per;
			rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
			/* Pack diagnostic values for the timely log. */
			logvar = new_per;
			logvar <<= 32;
			logvar |= alt;
			logvar2 = (uint32_t)rtt;
			logvar2 <<= 32;
			logvar2 |= (uint32_t)rtt_diff;
			logvar3 = rack_gp_rtt_maxmul;
			logvar3 <<= 32;
			logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
			rack_log_timely(rack, timely_says,
					logvar2, logvar3,
					logvar, __LINE__, 10);
		}
		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
			rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
		logged |= 4;
	} else if (rack->rc_gp_saw_ca) {
		/* Sent in CA */
		if (timely_says == 2) {
			new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
			alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
			if (alt < new_per)
				val = alt;
			else
				val = new_per;
		} else
			val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
		if (rack->r_ctl.rack_per_of_gp_ca > val) {
			ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
			rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
		} else {
			rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
			ca_red = 0;
			/* Pack diagnostic values for the timely log. */
			logvar = new_per;
			logvar <<= 32;
			logvar |= alt;
			logvar2 = (uint32_t)rtt;
			logvar2 <<= 32;
			logvar2 |= (uint32_t)rtt_diff;
			logvar3 = rack_gp_rtt_maxmul;
			logvar3 <<= 32;
			logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
			rack_log_timely(rack, timely_says,
					logvar2, logvar3,
					logvar, __LINE__, 10);
		}
		if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
			rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
		logged |= 2;
	}
	/* Count the decrease (3-bit saturating), clearing per sysctl. */
	if (rack->rc_gp_timely_dec_cnt < 0x7) {
		rack->rc_gp_timely_dec_cnt++;
		if (rack_timely_dec_clear &&
		    (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
			rack->rc_gp_timely_dec_cnt = 0;
	}
	logvar = ss_red;
	logvar <<= 32;
	logvar |= ca_red;
	rack_log_timely(rack,  logged, rec_red, rack_per_lower_bound, logvar,
			__LINE__, 2);
}
3972
static void
rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
    uint32_t rtt, uint32_t line, uint8_t reas)
{
	/*
	 * BB-log an RTT-shrink / probe-rtt state event
	 * (BBR_LOG_RTT_SHRINKS). "reas" is a RACK_RTTS_* reason code,
	 * "line" the caller's source line, "rtt" the RTT involved and
	 * "us_cts" the caller's microsecond timestamp.
	 */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = line;
		log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
		log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
		log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
		log.u_bbr.flex5 = rtt;
		/* flex6: five packed flag bits (highly_buffered .. saw_probe_rtt). */
		log.u_bbr.flex6 = rack->rc_highly_buffered;
		log.u_bbr.flex6 <<= 1;
		log.u_bbr.flex6 |= rack->forced_ack;
		log.u_bbr.flex6 <<= 1;
		log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
		log.u_bbr.flex6 <<= 1;
		log.u_bbr.flex6 |= rack->in_probe_rtt;
		log.u_bbr.flex6 <<= 1;
		log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
		log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
		log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
		log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
		log.u_bbr.flex8 = reas;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.delRate = rack_get_bw(rack);
		/* cur_del_rate: highest us RTT (high 32) | lowest us RTT (low 32). */
		log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
		log.u_bbr.cur_del_rate <<= 32;
		log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
		log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
		log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
		log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
		log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
		log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
		/* rttProp: us_cts (high 32) | entry goodput RTT (low 32). */
		log.u_bbr.rttProp = us_cts;
		log.u_bbr.rttProp <<= 32;
		log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_RTT_SHRINKS, 0,
		    0, &log, false, &rack->r_ctl.act_rcv_time);
	}
}
4023
4024 static void
rack_set_prtt_target(struct tcp_rack * rack,uint32_t segsiz,uint32_t rtt)4025 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
4026 {
4027 uint64_t bwdp;
4028
4029 bwdp = rack_get_bw(rack);
4030 bwdp *= (uint64_t)rtt;
4031 bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
4032 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
4033 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
4034 /*
4035 * A window protocol must be able to have 4 packets
4036 * outstanding as the floor in order to function
4037 * (especially considering delayed ack :D).
4038 */
4039 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
4040 }
4041 }
4042
static void
rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
	/**
	 * ProbeRTT is a bit different in rack_pacing than in
	 * BBR. It is like BBR in that it uses the lowering of
	 * the RTT as a signal that we saw something new and
	 * counts from there for how long between. But it is
	 * different in that its quite simple. It does not
	 * play with the cwnd and wait until we get down
	 * to N segments outstanding and hold that for
	 * 200ms. Instead it just sets the pacing reduction
	 * rate to a set percentage (70 by default) and hold
	 * that for a number of recent GP Srtt's.
	 */
	uint32_t segsiz;

	/* Remember when we last saw the lower RTT (in us). */
	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
	if (rack->rc_gp_dyn_mul == 0)
		return;

	if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
		/* We are idle */
		return;
	}
	if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
	    SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
		/*
		 * Stop the goodput now, the idea here is
		 * that future measurements with in_probe_rtt
		 * won't register if they are not greater so
		 * we want to get what info (if any) is available
		 * now.
		 */
		rack_do_goodput_measurement(rack->rc_tp, rack,
					    rack->rc_tp->snd_una, __LINE__,
					    RACK_QUALITY_PROBERTT);
	}
	/* Apply the probe-rtt pacing reduction and record entry time. */
	rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
	rack->r_ctl.rc_time_probertt_entered = us_cts;
	segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
		     rack->r_ctl.rc_pace_min_segs);
	rack->in_probe_rtt = 1;
	rack->measure_saw_probe_rtt = 1;
	rack->r_ctl.rc_time_probertt_starts = 0;
	rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
	/* Flight target from the min filtered RTT or gp SRTT, per sysctl. */
	if (rack_probertt_use_min_rtt_entry)
		rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
	else
		rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
	rack_log_rtt_shrinks(rack,  us_cts,  get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
			     __LINE__, RACK_RTTS_ENTERPROBE);
}
4096
static void
rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
	/*
	 * Leave probe-rtt mode: finish (or abandon) any in-progress
	 * goodput measurement, mark the tail of the send map
	 * app-limited so the reduced probe-rtt pacing rate does not
	 * drag down the b/w estimate, restore/clamp the pacing
	 * multipliers and, if configured, re-seed cwnd/ssthresh from
	 * the probe-rtt target flight.
	 */
	struct rack_sendmap *rsm;
	uint32_t segsiz;

	segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
	    rack->r_ctl.rc_pace_min_segs);
	rack->in_probe_rtt = 0;
	if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
	    SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
		/*
		 * Stop the goodput now, the idea here is
		 * that future measurements with in_probe_rtt
		 * won't register if they are not greater so
		 * we want to get what info (if any) is available
		 * now.
		 */
		rack_do_goodput_measurement(rack->rc_tp, rack,
		    rack->rc_tp->snd_una, __LINE__,
		    RACK_QUALITY_PROBERTT);
	} else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
		/*
		 * We don't have enough data to make a measurement.
		 * So lets just stop and start here after exiting
		 * probe-rtt. We probably are not interested in
		 * the results anyway.
		 */
		rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
	}
	/*
	 * Measurements through the current snd_max are going
	 * to be limited by the slower pacing rate.
	 *
	 * We need to mark these as app-limited so we
	 * don't collapse the b/w.
	 */
	rsm = tqhash_max(rack->r_ctl.tqh);
	if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
		if (rack->r_ctl.rc_app_limited_cnt == 0)
			rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
		else {
			/*
			 * Go out to the end app limited and mark
			 * this new one as next and move the end_appl up
			 * to this guy.
			 */
			if (rack->r_ctl.rc_end_appl)
				rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
			rack->r_ctl.rc_end_appl = rsm;
		}
		rsm->r_flags |= RACK_APP_LIMITED;
		rack->r_ctl.rc_app_limited_cnt++;
	}
	/*
	 * Now, we need to examine our pacing rate multipliers.
	 * If its under 100%, we need to kick it back up to
	 * 100%. We also don't let it be over our "max" above
	 * the actual rate i.e. 100% + rack_clamp_atexit_prtt.
	 * Note setting clamp_atexit_prtt to 0 has the effect
	 * of setting CA/SS to 100% always at exit (which is
	 * the default behavior).
	 */
	if (rack_probertt_clear_is) {
		/* Reset all of timely's increase/decrease bookkeeping. */
		rack->rc_gp_incr = 0;
		rack->rc_gp_bwred = 0;
		rack->rc_gp_timely_inc_cnt = 0;
		rack->rc_gp_timely_dec_cnt = 0;
	}
	/* Do we do any clamping at exit? */
	if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
		rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
		rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
	}
	if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
		rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
		rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
	}
	/*
	 * Lets set rtt_diff to 0, so that we will get a "boost"
	 * after exiting.
	 */
	rack->r_ctl.rc_rtt_diff = 0;

	/* Clear all flags so we start fresh */
	rack->rc_tp->t_bytes_acked = 0;
	rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
	/*
	 * If configured to, set the cwnd and ssthresh to
	 * our targets.
	 */
	if (rack_probe_rtt_sets_cwnd) {
		uint64_t ebdp;
		uint32_t setto;

		/* Set ssthresh so we get into CA once we hit our target */
		if (rack_probertt_use_min_rtt_exit == 1) {
			/* Set to min rtt */
			rack_set_prtt_target(rack, segsiz,
			    get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
		} else if (rack_probertt_use_min_rtt_exit == 2) {
			/* Set to current gp rtt */
			rack_set_prtt_target(rack, segsiz,
			    rack->r_ctl.rc_gp_srtt);
		} else if (rack_probertt_use_min_rtt_exit == 3) {
			/* Set to entry gp rtt */
			rack_set_prtt_target(rack, segsiz,
			    rack->r_ctl.rc_entry_gp_rtt);
		} else {
			uint64_t sum;
			uint32_t setval;

			/*
			 * Default: scale choice by how buffered the path
			 * looks, i.e. sum = 10 * entry_rtt / current_rtt
			 * (so sum >= 20 means entry RTT was at least 2x
			 * the current gp srtt).
			 */
			sum = rack->r_ctl.rc_entry_gp_rtt;
			sum *= 10;
			sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
			if (sum >= 20) {
				/*
				 * A highly buffered path needs
				 * cwnd space for timely to work.
				 * Lets set things up as if
				 * we are heading back here again.
				 */
				setval = rack->r_ctl.rc_entry_gp_rtt;
			} else if (sum >= 15) {
				/*
				 * Lets take the smaller of the
				 * two since we are just somewhat
				 * buffered.
				 */
				setval = rack->r_ctl.rc_gp_srtt;
				if (setval > rack->r_ctl.rc_entry_gp_rtt)
					setval = rack->r_ctl.rc_entry_gp_rtt;
			} else {
				/*
				 * Here we are not highly buffered
				 * and should pick the min we can to
				 * keep from causing loss.
				 */
				setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
			}
			rack_set_prtt_target(rack, segsiz,
			    setval);
		}
		if (rack_probe_rtt_sets_cwnd > 1) {
			/* There is a percentage here to boost */
			ebdp = rack->r_ctl.rc_target_probertt_flight;
			ebdp *= rack_probe_rtt_sets_cwnd;
			ebdp /= 100;
			setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
		} else
			setto = rack->r_ctl.rc_target_probertt_flight;
		rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
		if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
			/* Enforce a min */
			rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
		}
		/* If we set in the cwnd also set the ssthresh point so we are in CA */
		rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
	}
	rack_log_rtt_shrinks(rack, us_cts,
	    get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
	    __LINE__, RACK_RTTS_EXITPROBE);
	/* Clear times last so log has all the info */
	rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
	rack->r_ctl.rc_time_probertt_entered = us_cts;
	rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
	rack->r_ctl.rc_time_of_last_probertt = us_cts;
}
4265
static void
rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
{
	/*
	 * Periodic probe-rtt driver. If we are in probe-rtt, decide
	 * whether to keep reducing the pacing percentage, hold at the
	 * target, or exit. If we are not in probe-rtt, enter it when
	 * enough time has passed since we last saw the RTT lower.
	 */

	if (rack->rc_gp_filled == 0) {
		/* We do not do p-rtt unless we have gp measurements */
		return;
	}
	if (rack->in_probe_rtt) {
		uint64_t no_overflow;
		uint32_t endtime, must_stay;

		if (rack->r_ctl.rc_went_idle_time &&
		    ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
			/*
			 * We went idle during prtt, just exit now.
			 */
			rack_exit_probertt(rack, us_cts);
		} else if (rack_probe_rtt_safety_val &&
		    TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
		    ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
			/*
			 * Probe RTT safety value triggered!
			 */
			rack_log_rtt_shrinks(rack, us_cts,
			    get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
			    __LINE__, RACK_RTTS_SAFETY);
			rack_exit_probertt(rack, us_cts);
		}
		/*
		 * NOTE(review): control falls through here even after an
		 * exit above; rack_exit_probertt() re-stamps the probertt
		 * times to us_cts, which appears to keep the checks below
		 * from re-triggering — confirm intended.
		 */
		/* Calculate the max we will wait */
		endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
		if (rack->rc_highly_buffered)
			endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
		/* Calculate the min we must wait */
		must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
		if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
		    TSTMP_LT(us_cts, endtime)) {
			uint32_t calc;
			/* Do we lower more? */
no_exit:
			/*
			 * Scale the pacing-percentage reduction by how many
			 * gp_srtt's we have spent in probe-rtt so far, then
			 * clamp to the configured low threshold.
			 */
			if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
				calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
			else
				calc = 0;
			calc /= max(rack->r_ctl.rc_gp_srtt, 1);
			if (calc) {
				/* Maybe */
				calc *= rack_per_of_gp_probertt_reduce;
				if (calc > rack_per_of_gp_probertt)
					rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
				else
					rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
				/* Limit it too */
				if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
					rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
			}
			/* We must reach target or the time set */
			return;
		}
		if (rack->r_ctl.rc_time_probertt_starts == 0) {
			if ((TSTMP_LT(us_cts, must_stay) &&
			    rack->rc_highly_buffered) ||
			    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
			    rack->r_ctl.rc_target_probertt_flight)) {
				/* We are not past the must_stay time */
				goto no_exit;
			}
			rack_log_rtt_shrinks(rack, us_cts,
			    get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
			    __LINE__, RACK_RTTS_REACHTARGET);
			rack->r_ctl.rc_time_probertt_starts = us_cts;
			/* 0 means "not started"; avoid that sentinel value */
			if (rack->r_ctl.rc_time_probertt_starts == 0)
				rack->r_ctl.rc_time_probertt_starts = 1;
			/* Restore back to our rate we want to pace at in prtt */
			rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
		}
		/*
		 * Setup our end time, some number of gp_srtts plus 200ms.
		 */
		no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
		    (uint64_t)rack_probertt_gpsrtt_cnt_mul);
		if (rack_probertt_gpsrtt_cnt_div)
			endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
		else
			endtime = 0;
		endtime += rack_min_probertt_hold;
		endtime += rack->r_ctl.rc_time_probertt_starts;
		if (TSTMP_GEQ(us_cts, endtime)) {
			/* yes, exit probertt */
			rack_exit_probertt(rack, us_cts);
		}

	} else if ((rack->rc_skip_timely == 0) &&
	    (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) &&
	    ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) {
		/* Go into probertt, its been too long since we went lower */
		rack_enter_probertt(rack, us_cts);
	}
}
4366
static void
rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
    uint32_t rtt, int32_t rtt_diff)
{
	/*
	 * Adjust the dynamic goodput pacing multipliers. Combines the
	 * timely verdict (timely_says: 0 = increase, 1 = gradient
	 * decrease, 2 = over max-RTT threshold; see
	 * rack_make_timely_judgement) with where the latest b/w
	 * estimate (last_bw_est) falls relative to a band
	 * [low_bnd, up_bnd] built around the last compared b/w.
	 */
	uint64_t cur_bw, up_bnd, low_bnd, subfr;
	uint32_t losses;

	if ((rack->rc_gp_dyn_mul == 0) ||
	    (rack->use_fixed_rate) ||
	    (rack->in_probe_rtt) ||
	    (rack->rc_always_pace == 0)) {
		/* No dynamic GP multiplier in play */
		return;
	}
	/* Losses since this measurement window started */
	losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
	cur_bw = rack_get_bw(rack);
	/* Calculate our up and down range */
	up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
	up_bnd /= 100;
	up_bnd += rack->r_ctl.last_gp_comp_bw;

	subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
	subfr /= 100;
	low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
	if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
		/*
		 * This is the case where our RTT is above
		 * the max target and we have been configured
		 * to just do timely no bonus up stuff in that case.
		 *
		 * There are two configurations, set to 1, and we
		 * just do timely if we are over our max. If its
		 * set above 1 then we slam the multipliers down
		 * to 100 and then decrement per timely.
		 */
		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
		    __LINE__, 3);
		if (rack->r_ctl.rc_no_push_at_mrtt > 1)
			rack_validate_multipliers_at_or_below_100(rack);
		rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
	} else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) {
		/*
		 * We are decreasing this is a bit complicated this
		 * means we are loosing ground. This could be
		 * because another flow entered and we are competing
		 * for b/w with it. This will push the RTT up which
		 * makes timely unusable unless we want to get shoved
		 * into a corner and just be backed off (the age
		 * old problem with delay based CC).
		 *
		 * On the other hand if it was a route change we
		 * would like to stay somewhat contained and not
		 * blow out the buffers.
		 */
		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
		    __LINE__, 3);
		rack->r_ctl.last_gp_comp_bw = cur_bw;
		if (rack->rc_gp_bwred == 0) {
			/* Go into reduction counting */
			rack->rc_gp_bwred = 1;
			rack->rc_gp_timely_dec_cnt = 0;
		}
		if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) {
			/*
			 * Push another time with a faster pacing
			 * to try to gain back (we include override to
			 * get a full raise factor).
			 */
			if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
			    (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
			    (timely_says == 0) ||
			    (rack_down_raise_thresh == 0)) {
				/*
				 * Do an override up in b/w if we were
				 * below the threshold or if the threshold
				 * is zero we always do the raise.
				 */
				rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
			} else {
				/* Log it stays the same */
				rack_log_timely(rack, 0, last_bw_est, low_bnd, 0,
				    __LINE__, 11);
			}
			rack->rc_gp_timely_dec_cnt++;
			/* We are not incrementing really no-count */
			rack->rc_gp_incr = 0;
			rack->rc_gp_timely_inc_cnt = 0;
		} else {
			/*
			 * Lets just use the RTT
			 * information and give up
			 * pushing.
			 */
			goto use_timely;
		}
	} else if ((timely_says != 2) &&
	    !losses &&
	    (last_bw_est > up_bnd)) {
		/*
		 * We are increasing b/w lets keep going, updating
		 * our b/w and ignoring any timely input, unless
		 * of course we are at our max raise (if there is one).
		 */

		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
		    __LINE__, 3);
		rack->r_ctl.last_gp_comp_bw = cur_bw;
		if (rack->rc_gp_saw_ss &&
		    rack->r_ctl.rack_per_upper_bound_ss &&
		    (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) {
			/*
			 * In cases where we can't go higher
			 * we should just use timely.
			 */
			goto use_timely;
		}
		if (rack->rc_gp_saw_ca &&
		    rack->r_ctl.rack_per_upper_bound_ca &&
		    (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) {
			/*
			 * In cases where we can't go higher
			 * we should just use timely.
			 */
			goto use_timely;
		}
		rack->rc_gp_bwred = 0;
		rack->rc_gp_timely_dec_cnt = 0;
		/* You get a set number of pushes if timely is trying to reduce */
		if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
			rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
		} else {
			/* Log it stays the same */
			rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
			    __LINE__, 12);
		}
		return;
	} else {
		/*
		 * We are staying between the lower and upper range bounds
		 * so use timely to decide.
		 */
		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
		    __LINE__, 3);
use_timely:
		if (timely_says) {
			/* Timely wants a decrease (1 or 2) */
			rack->rc_gp_incr = 0;
			rack->rc_gp_timely_inc_cnt = 0;
			if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
			    !losses &&
			    (last_bw_est < low_bnd)) {
				/* We are loosing ground */
				rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
				rack->rc_gp_timely_dec_cnt++;
				/* We are not incrementing really no-count */
				rack->rc_gp_incr = 0;
				rack->rc_gp_timely_inc_cnt = 0;
			} else
				rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
		} else {
			/* Timely says increase (0) */
			rack->rc_gp_bwred = 0;
			rack->rc_gp_timely_dec_cnt = 0;
			rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
		}
	}
}
4532
4533 static int32_t
rack_make_timely_judgement(struct tcp_rack * rack,uint32_t rtt,int32_t rtt_diff,uint32_t prev_rtt)4534 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4535 {
4536 int32_t timely_says;
4537 uint64_t log_mult, log_rtt_a_diff;
4538
4539 log_rtt_a_diff = rtt;
4540 log_rtt_a_diff <<= 32;
4541 log_rtt_a_diff |= (uint32_t)rtt_diff;
4542 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4543 rack_gp_rtt_maxmul)) {
4544 /* Reduce the b/w multiplier */
4545 timely_says = 2;
4546 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4547 log_mult <<= 32;
4548 log_mult |= prev_rtt;
4549 rack_log_timely(rack, timely_says, log_mult,
4550 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4551 log_rtt_a_diff, __LINE__, 4);
4552 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4553 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4554 max(rack_gp_rtt_mindiv , 1)))) {
4555 /* Increase the b/w multiplier */
4556 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4557 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4558 max(rack_gp_rtt_mindiv , 1));
4559 log_mult <<= 32;
4560 log_mult |= prev_rtt;
4561 timely_says = 0;
4562 rack_log_timely(rack, timely_says, log_mult ,
4563 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4564 log_rtt_a_diff, __LINE__, 5);
4565 } else {
4566 /*
4567 * Use a gradient to find it the timely gradient
4568 * is:
4569 * grad = rc_rtt_diff / min_rtt;
4570 *
4571 * anything below or equal to 0 will be
4572 * a increase indication. Anything above
4573 * zero is a decrease. Note we take care
4574 * of the actual gradient calculation
4575 * in the reduction (its not needed for
4576 * increase).
4577 */
4578 log_mult = prev_rtt;
4579 if (rtt_diff <= 0) {
4580 /*
4581 * Rttdiff is less than zero, increase the
4582 * b/w multiplier (its 0 or negative)
4583 */
4584 timely_says = 0;
4585 rack_log_timely(rack, timely_says, log_mult,
4586 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4587 } else {
4588 /* Reduce the b/w multiplier */
4589 timely_says = 1;
4590 rack_log_timely(rack, timely_says, log_mult,
4591 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4592 }
4593 }
4594 return (timely_says);
4595 }
4596
4597 static inline int
rack_in_gp_window(struct tcpcb * tp,struct rack_sendmap * rsm)4598 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)
4599 {
4600 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4601 SEQ_LEQ(rsm->r_end, tp->gput_ack)) {
4602 /**
4603 * This covers the case that the
4604 * resent is completely inside
4605 * the gp range or up to it.
4606 * |----------------|
4607 * |-----| <or>
4608 * |----|
4609 * <or> |---|
4610 */
4611 return (1);
4612 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) &&
4613 SEQ_GT(rsm->r_end, tp->gput_seq)){
4614 /**
4615 * This covers the case of
4616 * |--------------|
4617 * |-------->|
4618 */
4619 return (1);
4620 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4621 SEQ_LT(rsm->r_start, tp->gput_ack) &&
4622 SEQ_GEQ(rsm->r_end, tp->gput_ack)) {
4623
4624 /**
4625 * This covers the case of
4626 * |--------------|
4627 * |-------->|
4628 */
4629 return (1);
4630 }
4631 return (0);
4632 }
4633
4634 static inline void
rack_mark_in_gp_win(struct tcpcb * tp,struct rack_sendmap * rsm)4635 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)
4636 {
4637
4638 if ((tp->t_flags & TF_GPUTINPROG) == 0)
4639 return;
4640 /*
4641 * We have a Goodput measurement in progress. Mark
4642 * the send if its within the window. If its not
4643 * in the window make sure it does not have the mark.
4644 */
4645 if (rack_in_gp_window(tp, rsm))
4646 rsm->r_flags |= RACK_IN_GP_WIN;
4647 else
4648 rsm->r_flags &= ~RACK_IN_GP_WIN;
4649 }
4650
4651 static inline void
rack_clear_gp_marks(struct tcpcb * tp,struct tcp_rack * rack)4652 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
4653 {
4654 /* A GP measurement is ending, clear all marks on the send map*/
4655 struct rack_sendmap *rsm = NULL;
4656
4657 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
4658 if (rsm == NULL) {
4659 rsm = tqhash_min(rack->r_ctl.tqh);
4660 }
4661 /* Nothing left? */
4662 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){
4663 rsm->r_flags &= ~RACK_IN_GP_WIN;
4664 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4665 }
4666 }
4667
4668
static inline void
rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
{
	/*
	 * (Re)apply RACK_IN_GP_WIN marks across the send map for the
	 * current goodput window [gput_seq, gput_ack]. Handles the
	 * case where the measurement starts ahead of snd_una, and
	 * sends that were transmitted before the measurement began.
	 */
	struct rack_sendmap *rsm = NULL;

	if (tp->snd_una == tp->snd_max) {
		/* Nothing outstanding yet, nothing to do here */
		return;
	}
	if (SEQ_GT(tp->gput_seq, tp->snd_una)) {
		/*
		 * We are measuring ahead of some outstanding
		 * data. We need to walk through up until we get
		 * to gp_seq marking so that no rsm is set incorrectly
		 * with RACK_IN_GP_WIN.
		 */
		rsm = tqhash_min(rack->r_ctl.tqh);
		while (rsm != NULL) {
			rack_mark_in_gp_win(tp, rsm);
			if (SEQ_GEQ(rsm->r_end, tp->gput_seq))
				break;
			rsm = tqhash_next(rack->r_ctl.tqh, rsm);
		}
	}
	/* rsm is non-NULL here only if the walk above stopped on gput_seq */
	if (rsm == NULL) {
		/*
		 * Need to find the GP seq, if rsm is
		 * set we stopped as we hit it.
		 */
		rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
		if (rsm == NULL)
			return;
		rack_mark_in_gp_win(tp, rsm);
	}
	/*
	 * Now we may need to mark already sent rsm, ahead of
	 * gput_seq in the window since they may have been sent
	 * *before* we started our measurment. The rsm, if non-null
	 * has been marked (note if rsm would have been NULL we would have
	 * returned in the previous block). So we go to the next, and continue
	 * until we run out of entries or we exceed the gp_ack value.
	 */
	rsm = tqhash_next(rack->r_ctl.tqh, rsm);
	while (rsm) {
		rack_mark_in_gp_win(tp, rsm);
		if (SEQ_GT(rsm->r_end, tp->gput_ack))
			break;
		rsm = tqhash_next(rack->r_ctl.tqh, rsm);
	}
}
4719
4720 static void
rack_log_gp_calc(struct tcp_rack * rack,uint32_t add_part,uint32_t sub_part,uint32_t srtt,uint64_t meas_bw,uint64_t utim,uint8_t meth,uint32_t line)4721 rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line)
4722 {
4723 if (tcp_bblogging_on(rack->rc_tp)) {
4724 union tcp_log_stackspecific log;
4725 struct timeval tv;
4726
4727 memset(&log, 0, sizeof(log));
4728 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4729 log.u_bbr.flex1 = add_part;
4730 log.u_bbr.flex2 = sub_part;
4731 log.u_bbr.flex3 = rack_wma_divisor;
4732 log.u_bbr.flex4 = srtt;
4733 log.u_bbr.flex7 = (uint16_t)line;
4734 log.u_bbr.flex8 = meth;
4735 log.u_bbr.delRate = rack->r_ctl.gp_bw;
4736 log.u_bbr.cur_del_rate = meas_bw;
4737 log.u_bbr.rttProp = utim;
4738 TCP_LOG_EVENTP(rack->rc_tp, NULL,
4739 &rack->rc_inp->inp_socket->so_rcv,
4740 &rack->rc_inp->inp_socket->so_snd,
4741 BBR_LOG_THRESH_CALC, 0,
4742 0, &log, false, &rack->r_ctl.act_rcv_time);
4743 }
4744 }
4745
4746 static void
rack_do_goodput_measurement(struct tcpcb * tp,struct tcp_rack * rack,tcp_seq th_ack,int line,uint8_t quality)4747 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4748 tcp_seq th_ack, int line, uint8_t quality)
4749 {
4750 uint64_t tim, bytes_ps, stim, utim;
4751 uint32_t segsiz, bytes, reqbytes, us_cts;
4752 int32_t gput, new_rtt_diff, timely_says;
4753 uint64_t resid_bw, subpart = 0, addpart = 0, srtt;
4754 int did_add = 0;
4755
4756 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
4757 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4758 if (TSTMP_GEQ(us_cts, tp->gput_ts))
4759 tim = us_cts - tp->gput_ts;
4760 else
4761 tim = 0;
4762 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4763 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4764 else
4765 stim = 0;
4766 /*
4767 * Use the larger of the send time or ack time. This prevents us
4768 * from being influenced by ack artifacts to come up with too
4769 * high of measurement. Note that since we are spanning over many more
4770 * bytes in most of our measurements hopefully that is less likely to
4771 * occur.
4772 */
4773 if (tim > stim)
4774 utim = max(tim, 1);
4775 else
4776 utim = max(stim, 1);
4777 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4778 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL);
4779 if ((tim == 0) && (stim == 0)) {
4780 /*
4781 * Invalid measurement time, maybe
4782 * all on one ack/one send?
4783 */
4784 bytes = 0;
4785 bytes_ps = 0;
4786 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4787 0, 0, 0, 10, __LINE__, NULL, quality);
4788 goto skip_measurement;
4789 }
4790 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4791 /* We never made a us_rtt measurement? */
4792 bytes = 0;
4793 bytes_ps = 0;
4794 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4795 0, 0, 0, 10, __LINE__, NULL, quality);
4796 goto skip_measurement;
4797 }
4798 /*
4799 * Calculate the maximum possible b/w this connection
4800 * could have. We base our calculation on the lowest
4801 * rtt we have seen during the measurement and the
4802 * largest rwnd the client has given us in that time. This
4803 * forms a BDP that is the maximum that we could ever
4804 * get to the client. Anything larger is not valid.
4805 *
4806 * I originally had code here that rejected measurements
4807 * where the time was less than 1/2 the latest us_rtt.
4808 * But after thinking on that I realized its wrong since
4809 * say you had a 150Mbps or even 1Gbps link, and you
4810 * were a long way away.. example I am in Europe (100ms rtt)
4811 * talking to my 1Gbps link in S.C. Now measuring say 150,000
4812 * bytes my time would be 1.2ms, and yet my rtt would say
4813 * the measurement was invalid the time was < 50ms. The
4814 * same thing is true for 150Mb (8ms of time).
4815 *
4816 * A better way I realized is to look at what the maximum
4817 * the connection could possibly do. This is gated on
4818 * the lowest RTT we have seen and the highest rwnd.
4819 * We should in theory never exceed that, if we are
4820 * then something on the path is storing up packets
4821 * and then feeding them all at once to our endpoint
4822 * messing up our measurement.
4823 */
4824 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4825 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4826 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
4827 if (SEQ_LT(th_ack, tp->gput_seq)) {
4828 /* No measurement can be made */
4829 bytes = 0;
4830 bytes_ps = 0;
4831 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4832 0, 0, 0, 10, __LINE__, NULL, quality);
4833 goto skip_measurement;
4834 } else
4835 bytes = (th_ack - tp->gput_seq);
4836 bytes_ps = (uint64_t)bytes;
4837 /*
4838 * Don't measure a b/w for pacing unless we have gotten at least
4839 * an initial windows worth of data in this measurement interval.
4840 *
4841 * Small numbers of bytes get badly influenced by delayed ack and
4842 * other artifacts. Note we take the initial window or our
4843 * defined minimum GP (defaulting to 10 which hopefully is the
4844 * IW).
4845 */
4846 if (rack->rc_gp_filled == 0) {
4847 /*
4848 * The initial estimate is special. We
4849 * have blasted out an IW worth of packets
4850 * without a real valid ack ts results. We
4851 * then setup the app_limited_needs_set flag,
4852 * this should get the first ack in (probably 2
4853 * MSS worth) to be recorded as the timestamp.
4854 * We thus allow a smaller number of bytes i.e.
4855 * IW - 2MSS.
4856 */
4857 reqbytes -= (2 * segsiz);
4858 /* Also lets fill previous for our first measurement to be neutral */
4859 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4860 }
4861 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4862 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4863 rack->r_ctl.rc_app_limited_cnt,
4864 0, 0, 10, __LINE__, NULL, quality);
4865 goto skip_measurement;
4866 }
4867 /*
4868 * We now need to calculate the Timely like status so
4869 * we can update (possibly) the b/w multipliers.
4870 */
4871 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4872 if (rack->rc_gp_filled == 0) {
4873 /* No previous reading */
4874 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4875 } else {
4876 if (rack->measure_saw_probe_rtt == 0) {
4877 /*
4878 * We don't want a probertt to be counted
4879 * since it will be negative incorrectly. We
4880 * expect to be reducing the RTT when we
4881 * pace at a slower rate.
4882 */
4883 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4884 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4885 }
4886 }
4887 timely_says = rack_make_timely_judgement(rack,
4888 rack->r_ctl.rc_gp_srtt,
4889 rack->r_ctl.rc_rtt_diff,
4890 rack->r_ctl.rc_prev_gp_srtt
4891 );
4892 bytes_ps *= HPTS_USEC_IN_SEC;
4893 bytes_ps /= utim;
4894 if (bytes_ps > rack->r_ctl.last_max_bw) {
4895 /*
4896 * Something is on path playing
4897 * since this b/w is not possible based
4898 * on our BDP (highest rwnd and lowest rtt
4899 * we saw in the measurement window).
4900 *
4901 * Another option here would be to
4902 * instead skip the measurement.
4903 */
4904 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4905 bytes_ps, rack->r_ctl.last_max_bw, 0,
4906 11, __LINE__, NULL, quality);
4907 bytes_ps = rack->r_ctl.last_max_bw;
4908 }
4909 /* We store gp for b/w in bytes per second */
4910 if (rack->rc_gp_filled == 0) {
4911 /* Initial measurement */
4912 if (bytes_ps) {
4913 rack->r_ctl.gp_bw = bytes_ps;
4914 rack->rc_gp_filled = 1;
4915 rack->r_ctl.num_measurements = 1;
4916 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4917 } else {
4918 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4919 rack->r_ctl.rc_app_limited_cnt,
4920 0, 0, 10, __LINE__, NULL, quality);
4921 }
4922 if (tcp_in_hpts(rack->rc_tp) &&
4923 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4924 /*
4925 * Ok we can't trust the pacer in this case
4926 * where we transition from un-paced to paced.
4927 * Or for that matter when the burst mitigation
4928 * was making a wild guess and got it wrong.
4929 * Stop the pacer and clear up all the aggregate
4930 * delays etc.
4931 */
4932 tcp_hpts_remove(rack->rc_tp);
4933 rack->r_ctl.rc_hpts_flags = 0;
4934 rack->r_ctl.rc_last_output_to = 0;
4935 }
4936 did_add = 2;
4937 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4938 /* Still a small number run an average */
4939 rack->r_ctl.gp_bw += bytes_ps;
4940 addpart = rack->r_ctl.num_measurements;
4941 rack->r_ctl.num_measurements++;
4942 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4943 /* We have collected enough to move forward */
4944 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4945 }
4946 rack_set_pace_segments(tp, rack, __LINE__, NULL);
4947 did_add = 3;
4948 } else {
4949 /*
4950 * We want to take 1/wma of the goodput and add in to 7/8th
4951 * of the old value weighted by the srtt. So if your measurement
4952 * period is say 2 SRTT's long you would get 1/4 as the
4953 * value, if it was like 1/2 SRTT then you would get 1/16th.
4954 *
4955 * But we must be careful not to take too much i.e. if the
4956 * srtt is say 20ms and the measurement is taken over
4957 * 400ms our weight would be 400/20 i.e. 20. On the
4958 * other hand if we get a measurement over 1ms with a
4959 * 10ms rtt we only want to take a much smaller portion.
4960 */
4961 uint8_t meth;
4962
4963 if (rack->r_ctl.num_measurements < 0xff) {
4964 rack->r_ctl.num_measurements++;
4965 }
4966 srtt = (uint64_t)tp->t_srtt;
4967 if (srtt == 0) {
4968 /*
4969 * Strange why did t_srtt go back to zero?
4970 */
4971 if (rack->r_ctl.rc_rack_min_rtt)
4972 srtt = rack->r_ctl.rc_rack_min_rtt;
4973 else
4974 srtt = HPTS_USEC_IN_MSEC;
4975 }
4976 /*
4977 * XXXrrs: Note for reviewers, in playing with
4978 * dynamic pacing I discovered this GP calculation
4979 * as done originally leads to some undesired results.
4980 * Basically you can get longer measurements contributing
4981 * too much to the WMA. Thus I changed it if you are doing
4982 * dynamic adjustments to only do the aportioned adjustment
4983 * if we have a very small (time wise) measurement. Longer
4984 * measurements just get there weight (defaulting to 1/8)
4985 * add to the WMA. We may want to think about changing
4986 * this to always do that for both sides i.e. dynamic
4987 * and non-dynamic... but considering lots of folks
4988 * were playing with this I did not want to change the
4989 * calculation per.se. without your thoughts.. Lawerence?
4990 * Peter??
4991 */
4992 if (rack->rc_gp_dyn_mul == 0) {
4993 subpart = rack->r_ctl.gp_bw * utim;
4994 subpart /= (srtt * 8);
4995 if (subpart < (rack->r_ctl.gp_bw / 2)) {
4996 /*
4997 * The b/w update takes no more
4998 * away then 1/2 our running total
4999 * so factor it in.
5000 */
5001 addpart = bytes_ps * utim;
5002 addpart /= (srtt * 8);
5003 meth = 1;
5004 } else {
5005 /*
5006 * Don't allow a single measurement
5007 * to account for more than 1/2 of the
5008 * WMA. This could happen on a retransmission
5009 * where utim becomes huge compared to
5010 * srtt (multiple retransmissions when using
5011 * the sending rate which factors in all the
5012 * transmissions from the first one).
5013 */
5014 subpart = rack->r_ctl.gp_bw / 2;
5015 addpart = bytes_ps / 2;
5016 meth = 2;
5017 }
5018 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
5019 resid_bw = rack->r_ctl.gp_bw - subpart;
5020 rack->r_ctl.gp_bw = resid_bw + addpart;
5021 did_add = 1;
5022 } else {
5023 if ((utim / srtt) <= 1) {
5024 /*
5025 * The b/w update was over a small period
5026 * of time. The idea here is to prevent a small
5027 * measurement time period from counting
5028 * too much. So we scale it based on the
5029 * time so it attributes less than 1/rack_wma_divisor
5030 * of its measurement.
5031 */
5032 subpart = rack->r_ctl.gp_bw * utim;
5033 subpart /= (srtt * rack_wma_divisor);
5034 addpart = bytes_ps * utim;
5035 addpart /= (srtt * rack_wma_divisor);
5036 meth = 3;
5037 } else {
5038 /*
5039 * The scaled measurement was long
5040 * enough so lets just add in the
5041 * portion of the measurement i.e. 1/rack_wma_divisor
5042 */
5043 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
5044 addpart = bytes_ps / rack_wma_divisor;
5045 meth = 4;
5046 }
5047 if ((rack->measure_saw_probe_rtt == 0) ||
5048 (bytes_ps > rack->r_ctl.gp_bw)) {
5049 /*
5050 * For probe-rtt we only add it in
5051 * if its larger, all others we just
5052 * add in.
5053 */
5054 did_add = 1;
5055 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
5056 resid_bw = rack->r_ctl.gp_bw - subpart;
5057 rack->r_ctl.gp_bw = resid_bw + addpart;
5058 }
5059 }
5060 rack_set_pace_segments(tp, rack, __LINE__, NULL);
5061 }
5062 /*
5063 * We only watch the growth of the GP during the initial startup
5064 * or first-slowstart that ensues. If we ever needed to watch
5065 * growth of gp outside of that period all we need to do is
5066 * remove the first clause of this if (rc_initial_ss_comp).
5067 */
5068 if ((rack->rc_initial_ss_comp == 0) &&
5069 (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) {
5070 uint64_t gp_est;
5071
5072 gp_est = bytes_ps;
5073 if (tcp_bblogging_on(rack->rc_tp)) {
5074 union tcp_log_stackspecific log;
5075 struct timeval tv;
5076
5077 memset(&log, 0, sizeof(log));
5078 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5079 log.u_bbr.flex1 = rack->r_ctl.current_round;
5080 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
5081 log.u_bbr.delRate = gp_est;
5082 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
5083 log.u_bbr.flex8 = 41;
5084 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5085 0, &log, false, NULL, __func__, __LINE__,&tv);
5086 }
5087 if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) ||
5088 (rack->r_ctl.last_gpest == 0)) {
5089 /*
5090 * The round we get our measurement averaging going
5091 * is the base round so it always is the source point
5092 * for when we had our first increment. From there on
5093 * we only record the round that had a rise.
5094 */
5095 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
5096 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
5097 } else if (gp_est >= rack->r_ctl.last_gpest) {
5098 /*
5099 * Test to see if its gone up enough
5100 * to set the round count up to now. Note
5101 * that on the seeding of the 4th measurement we
5102 */
5103 gp_est *= 1000;
5104 gp_est /= rack->r_ctl.last_gpest;
5105 if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) {
5106 /*
5107 * We went up enough to record the round.
5108 */
5109 if (tcp_bblogging_on(rack->rc_tp)) {
5110 union tcp_log_stackspecific log;
5111 struct timeval tv;
5112
5113 memset(&log, 0, sizeof(log));
5114 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5115 log.u_bbr.flex1 = rack->r_ctl.current_round;
5116 log.u_bbr.flex2 = (uint32_t)gp_est;
5117 log.u_bbr.flex3 = rack->r_ctl.gp_gain_req;
5118 log.u_bbr.delRate = gp_est;
5119 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
5120 log.u_bbr.flex8 = 42;
5121 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5122 0, &log, false, NULL, __func__, __LINE__,&tv);
5123 }
5124 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
5125 if (rack->r_ctl.use_gp_not_last == 1)
5126 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
5127 else
5128 rack->r_ctl.last_gpest = bytes_ps;
5129 }
5130 }
5131 }
5132 if ((rack->gp_ready == 0) &&
5133 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
5134 /* We have enough measurements now */
5135 rack->gp_ready = 1;
5136 if (rack->dgp_on ||
5137 rack->rack_hibeta)
5138 rack_set_cc_pacing(rack);
5139 if (rack->defer_options)
5140 rack_apply_deferred_options(rack);
5141 }
5142 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
5143 rack_get_bw(rack), 22, did_add, NULL, quality);
5144 /* We do not update any multipliers if we are in or have seen a probe-rtt */
5145
5146 if ((rack->measure_saw_probe_rtt == 0) &&
5147 rack->rc_gp_rtt_set) {
5148 if (rack->rc_skip_timely == 0) {
5149 rack_update_multiplier(rack, timely_says, bytes_ps,
5150 rack->r_ctl.rc_gp_srtt,
5151 rack->r_ctl.rc_rtt_diff);
5152 }
5153 }
5154 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
5155 rack_get_bw(rack), 3, line, NULL, quality);
5156 rack_log_pacing_delay_calc(rack,
5157 bytes, /* flex2 */
5158 tim, /* flex1 */
5159 bytes_ps, /* bw_inuse */
5160 rack->r_ctl.gp_bw, /* delRate */
5161 rack_get_lt_bw(rack), /* rttProp */
5162 20, line, NULL, 0);
5163 /* reset the gp srtt and setup the new prev */
5164 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
5165 /* Record the lost count for the next measurement */
5166 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
5167 skip_measurement:
5168 /*
5169 * We restart our diffs based on the gpsrtt in the
5170 * measurement window.
5171 */
5172 rack->rc_gp_rtt_set = 0;
5173 rack->rc_gp_saw_rec = 0;
5174 rack->rc_gp_saw_ca = 0;
5175 rack->rc_gp_saw_ss = 0;
5176 rack->rc_dragged_bottom = 0;
5177 if (quality == RACK_QUALITY_HIGH) {
5178 /*
5179 * Gput in the stats world is in kbps where bytes_ps is
5180 * bytes per second so we do ((x * 8)/ 1000).
5181 */
5182 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000);
5183 #ifdef STATS
5184 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
5185 gput);
5186 /*
5187 * XXXLAS: This is a temporary hack, and should be
5188 * chained off VOI_TCP_GPUT when stats(9) grows an
5189 * API to deal with chained VOIs.
5190 */
5191 if (tp->t_stats_gput_prev > 0)
5192 stats_voi_update_abs_s32(tp->t_stats,
5193 VOI_TCP_GPUT_ND,
5194 ((gput - tp->t_stats_gput_prev) * 100) /
5195 tp->t_stats_gput_prev);
5196 #endif
5197 tp->t_stats_gput_prev = gput;
5198 }
5199 tp->t_flags &= ~TF_GPUTINPROG;
5200 /*
5201 * Now are we app limited now and there is space from where we
5202 * were to where we want to go?
5203 *
5204 * We don't do the other case i.e. non-applimited here since
5205 * the next send will trigger us picking up the missing data.
5206 */
5207 if (rack->r_ctl.rc_first_appl &&
5208 TCPS_HAVEESTABLISHED(tp->t_state) &&
5209 rack->r_ctl.rc_app_limited_cnt &&
5210 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
5211 ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
5212 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
5213 /*
5214 * Yep there is enough outstanding to make a measurement here.
5215 */
5216 struct rack_sendmap *rsm;
5217
5218 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
5219 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
5220 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
5221 rack->app_limited_needs_set = 0;
5222 tp->gput_seq = th_ack;
5223 if (rack->in_probe_rtt)
5224 rack->measure_saw_probe_rtt = 1;
5225 else if ((rack->measure_saw_probe_rtt) &&
5226 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
5227 rack->measure_saw_probe_rtt = 0;
5228 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
5229 /* There is a full window to gain info from */
5230 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
5231 } else {
5232 /* We can only measure up to the applimited point */
5233 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
5234 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
5235 /*
5236 * We don't have enough to make a measurement.
5237 */
5238 tp->t_flags &= ~TF_GPUTINPROG;
5239 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
5240 0, 0, 0, 6, __LINE__, NULL, quality);
5241 return;
5242 }
5243 }
5244 if (tp->t_state >= TCPS_FIN_WAIT_1) {
5245 /*
5246 * We will get no more data into the SB
5247 * this means we need to have the data available
5248 * before we start a measurement.
5249 */
5250 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) {
5251 /* Nope not enough data. */
5252 return;
5253 }
5254 }
5255 tp->t_flags |= TF_GPUTINPROG;
5256 /*
5257 * Now we need to find the timestamp of the send at tp->gput_seq
5258 * for the send based measurement.
5259 */
5260 rack->r_ctl.rc_gp_cumack_ts = 0;
5261 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
5262 if (rsm) {
5263 /* Ok send-based limit is set */
5264 if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
5265 /*
5266 * Move back to include the earlier part
5267 * so our ack time lines up right (this may
5268 * make an overlapping measurement but thats
5269 * ok).
5270 */
5271 tp->gput_seq = rsm->r_start;
5272 }
5273 if (rsm->r_flags & RACK_ACKED) {
5274 struct rack_sendmap *nrsm;
5275
5276 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
5277 tp->gput_seq = rsm->r_end;
5278 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
5279 if (nrsm)
5280 rsm = nrsm;
5281 else {
5282 rack->app_limited_needs_set = 1;
5283 }
5284 } else
5285 rack->app_limited_needs_set = 1;
5286 /* We always go from the first send */
5287 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
5288 } else {
5289 /*
5290 * If we don't find the rsm due to some
5291 * send-limit set the current time, which
5292 * basically disables the send-limit.
5293 */
5294 struct timeval tv;
5295
5296 microuptime(&tv);
5297 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
5298 }
5299 rack_tend_gp_marks(tp, rack);
5300 rack_log_pacing_delay_calc(rack,
5301 tp->gput_seq,
5302 tp->gput_ack,
5303 (uintptr_t)rsm,
5304 tp->gput_ts,
5305 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
5306 9,
5307 __LINE__, rsm, quality);
5308 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
5309 } else {
5310 /*
5311 * To make sure proper timestamp merging occurs, we need to clear
5312 * all GP marks if we don't start a measurement.
5313 */
5314 rack_clear_gp_marks(tp, rack);
5315 }
5316 }
5317
5318 /*
5319 * CC wrapper hook functions
5320 */
/*
 * Congestion control wrapper run for each ack that advances snd_una.
 * Feeds the acked bytes into the byte-counting (ABC) machinery, closes
 * out any long-term b/w (lt_bw) sampling interval once all data is
 * acked, triggers a goodput measurement when one is in progress and
 * has enough data, invokes the CC module's ack_received hook with the
 * selected local-ABC multiplier, and maintains the must-retran
 * accounting, largest-cwnd tracking and initial slow-start completion
 * state.
 */
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
    uint16_t type, int32_t post_recovery)
{
	uint32_t prior_cwnd, acked;
	struct tcp_log_buffer *lgb = NULL;
	uint8_t labc_to_use, quality;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	tp->t_ccv.nsegs = nsegs;
	acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una);
	if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
		uint32_t max;

		/* Cap the per-ack byte credit while coming out of recovery. */
		max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
		if (tp->t_ccv.bytes_this_ack > max) {
			tp->t_ccv.bytes_this_ack = max;
		}
	}
#ifdef STATS
	stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
	    ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
#endif
	if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
		/*
		 * We will ack all the data, time to end any
		 * lt_bw_up we have running until something
		 * new is sent. Note we need to use the actual
		 * ack_rcv_time which with pacing may be different.
		 */
		uint64_t tmark;

		rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
		rack->r_ctl.lt_seq = tp->snd_max;
		tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
		if (tmark >= rack->r_ctl.lt_timemark) {
			rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
		}
		rack->r_ctl.lt_timemark = tmark;
		rack->lt_bw_up = 0;
	}
	quality = RACK_QUALITY_NONE;
	if ((tp->t_flags & TF_GPUTINPROG) &&
	    rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
		/* Measure the Goodput */
		rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
	}
	/* Which way are we limited, if not cwnd limited no advance in CA */
	if (tp->snd_cwnd <= tp->snd_wnd)
		tp->t_ccv.flags |= CCF_CWND_LIMITED;
	else
		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;
	if (tp->snd_cwnd > tp->snd_ssthresh) {
		/* In CA: count acked bytes toward ABC, bounded per ack. */
		tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack,
		    nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
		/* For the setting of a window past use the actual scwnd we are using */
		if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
			tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
			tp->t_ccv.flags |= CCF_ABC_SENTAWND;
		}
	} else {
		tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
		tp->t_bytes_acked = 0;
	}
	prior_cwnd = tp->snd_cwnd;
	/*
	 * Select the local ABC multiplier: the rack-configured rc_labc
	 * normally, or the post-recovery cap when one applies (and we
	 * are not forced to use labc for recovery or in a low-buffer
	 * client situation).
	 */
	if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
	    (rack_client_low_buf && rack->client_bufferlvl &&
	    (rack->client_bufferlvl < rack_client_low_buf)))
		labc_to_use = rack->rc_labc;
	else
		labc_to_use = rack_max_abc_post_recovery;
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = th_ack;
		log.u_bbr.flex2 = tp->t_ccv.flags;
		log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
		log.u_bbr.flex4 = tp->t_ccv.nsegs;
		log.u_bbr.flex5 = labc_to_use;
		log.u_bbr.flex6 = prior_cwnd;
		log.u_bbr.flex7 = 1;	/* always doing RFC6675 SACK */
		log.u_bbr.flex8 = 1;
		lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
		    0, &log, false, NULL, __func__, __LINE__,&tv);
	}
	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->t_ccv.curack = th_ack;
		tp->t_ccv.labc = labc_to_use;
		tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
	}
	if (lgb) {
		/* Record the cwnd the CC module left us with in the log. */
		lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
	}
	if (rack->r_must_retran) {
		if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
			/*
			 * We now are beyond the rxt point so lets disable
			 * the flag.
			 */
			rack->r_ctl.rc_out_at_rto = 0;
			rack->r_must_retran = 0;
		} else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
			/*
			 * Only decrement the rc_out_at_rto if the cwnd advances
			 * at least a whole segment. Otherwise next time the peer
			 * acks, we won't be able to send; this generally happens
			 * when we are in Congestion Avoidance.
			 */
			if (acked <= rack->r_ctl.rc_out_at_rto){
				rack->r_ctl.rc_out_at_rto -= acked;
			} else {
				rack->r_ctl.rc_out_at_rto = 0;
			}
		}
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
#endif
	if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
		rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
	}
	if ((rack->rc_initial_ss_comp == 0) &&
	    (tp->snd_cwnd >= tp->snd_ssthresh)) {
		/*
		 * The cwnd has grown beyond ssthresh we have
		 * entered ca and completed our first Slowstart.
		 */
		rack->rc_initial_ss_comp = 1;
	}
}
5456
5457 static void
tcp_rack_partialack(struct tcpcb * tp)5458 tcp_rack_partialack(struct tcpcb *tp)
5459 {
5460 struct tcp_rack *rack;
5461
5462 rack = (struct tcp_rack *)tp->t_fb_ptr;
5463 INP_WLOCK_ASSERT(tptoinpcb(tp));
5464 /*
5465 * If we are doing PRR and have enough
5466 * room to send <or> we are pacing and prr
5467 * is disabled we will want to see if we
5468 * can send data (by setting r_wanted_output to
5469 * true).
5470 */
5471 if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
5472 rack->rack_no_prr)
5473 rack->r_wanted_output = 1;
5474 }
5475
static void
rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
{
	/*
	 * Now exit recovery.
	 *
	 * NOTE(review): rack and how are currently unused here; the how
	 * code the callers pass (1 = normal post-recovery, 2 = RTO) only
	 * serves to document the call sites.
	 */
	EXIT_RECOVERY(tp->t_flags);
}
5484
/*
 * Finish up a recovery episode ended by the ack at th_ack: run the CC
 * module's post_recovery hook (but never let it drop cwnd below
 * ssthresh), optionally add unspent PRR send credit back into cwnd
 * when not application limited, decay the DSACK persistence counter,
 * restore an ssthresh saved at RTO-from-recovery time if it is larger,
 * and finally exit the recovery state.
 */
static void
rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
{
	struct tcp_rack *rack;
	uint32_t orig_cwnd;

	orig_cwnd = tp->snd_cwnd;
	INP_WLOCK_ASSERT(tptoinpcb(tp));
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	/* only alert CC if we alerted when we entered */
	if (CC_ALGO(tp)->post_recovery != NULL) {
		tp->t_ccv.curack = th_ack;
		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
		if (tp->snd_cwnd < tp->snd_ssthresh) {
			/*
			 * Rack has burst control and pacing
			 * so lets not set this any lower than
			 * snd_ssthresh per RFC-6582 (option 2).
			 */
			tp->snd_cwnd = tp->snd_ssthresh;
		}
	}
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = th_ack;
		log.u_bbr.flex2 = tp->t_ccv.flags;
		log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
		log.u_bbr.flex4 = tp->t_ccv.nsegs;
		log.u_bbr.flex5 = V_tcp_abc_l_var;
		log.u_bbr.flex6 = orig_cwnd;
		log.u_bbr.flex7 = 1;	/* always doing RFC6675 SACK */
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 2;
		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
		    0, &log, false, NULL, __func__, __LINE__, &tv);
	}
	if ((rack->rack_no_prr == 0) &&
	    (rack->no_prr_addback == 0) &&
	    (rack->r_ctl.rc_prr_sndcnt > 0)) {
		/*
		 * Suck the next prr cnt back into cwnd, but
		 * only do that if we are not application limited.
		 */
		if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) {
			/*
			 * We are allowed to add back to the cwnd the amount we did
			 * not get out if:
			 * a) no_prr_addback is off.
			 * b) we are not app limited
			 * c) we are doing prr
			 * <and>
			 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
			 */
			tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
			    rack->r_ctl.rc_prr_sndcnt);
		}
		rack->r_ctl.rc_prr_sndcnt = 0;
		rack_log_to_prr(rack, 1, 0, __LINE__);
	}
	rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
	tp->snd_recover = tp->snd_una;
	if (rack->r_ctl.dsack_persist) {
		/* Age out the DSACK count once persistence expires. */
		rack->r_ctl.dsack_persist--;
		if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
			rack->r_ctl.num_dsack = 0;
		}
		rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
	}
	if (rack->rto_from_rec == 1) {
		/*
		 * An RTO fired while in recovery; restore the larger
		 * ssthresh that was saved at that point.
		 */
		rack->rto_from_rec = 0;
		if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
			tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
	}
	rack_exit_recovery(tp, rack, 1);
}
5564
/*
 * React to a congestion signal of the given type (CC_NDUPACK, CC_ECN,
 * CC_RTO or CC_RTO_ERR) for the ack at 'ack'. Performs the RACK-side
 * state changes for each signal, invokes the CC module's cong_signal
 * hook (except for CC_RTO, which is handled locally here), and when
 * the signal newly moves the connection into recovery records the
 * cwnd/ssthresh snapshots taken at entry.
 */
static void
rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
{
	struct tcp_rack *rack;
	uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif
	/*
	 * Snapshot ssthresh/cwnd only when we are not already in
	 * recovery; they are consumed at the bottom only in that case.
	 */
	if (IN_RECOVERY(tp->t_flags) == 0) {
		in_rec_at_entry = 0;
		ssthresh_enter = tp->snd_ssthresh;
		cwnd_enter = tp->snd_cwnd;
	} else
		in_rec_at_entry = 1;
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (type) {
	case CC_NDUPACK:
		tp->t_flags &= ~TF_WASFRECOVERY;
		tp->t_flags &= ~TF_WASCRECOVERY;
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			/* Check if this is the end of the initial Start-up i.e. initial slow-start */
			if (rack->rc_initial_ss_comp == 0) {
				/* Yep it is the end of the initial slowstart */
				rack->rc_initial_ss_comp = 1;
			}
			rack->r_ctl.rc_prr_delivered = 0;
			rack->r_ctl.rc_prr_out = 0;
			rack->r_fast_output = 0;
			rack->r_ctl.recovery_rxt_cnt = 0;
			if (rack->rack_no_prr == 0) {
				rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
				rack_log_to_prr(rack, 2, in_rec_at_entry, line);
			}
			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
			rack->r_fast_output = 0;
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		rack->r_fast_output = 0;
		if (IN_RECOVERY(tp->t_flags))
			rack_exit_recovery(tp, rack, 2);
		orig_cwnd = tp->snd_cwnd;
		rack_log_to_prr(rack, 16, orig_cwnd, line);
		if (CC_ALGO(tp)->cong_signal == NULL) {
			/* TSNH */
			tp->snd_ssthresh = max(2,
			    min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
			    ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
			tp->snd_cwnd = ctf_fixed_maxseg(tp);
		}
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
	case CC_RTO_ERR:
		KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY) {
			ENTER_FASTRECOVERY(tp->t_flags);
			tp->t_flags &= ~TF_WASFRECOVERY;
		}
		if (tp->t_flags & TF_WASCRECOVERY) {
			ENTER_CONGRECOVERY(tp->t_flags);
			tp->t_flags &= ~TF_WASCRECOVERY;
		}
		tp->snd_nxt = tp->snd_max;
		tp->t_badrxtwin = 0;
		break;
	}
	/* CC_RTO already did its own cwnd/ssthresh handling above. */
	if ((CC_ALGO(tp)->cong_signal != NULL) &&
	    (type != CC_RTO)){
		tp->t_ccv.curack = ack;
		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
	}
	if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
		/* We just transitioned into recovery; save entry state. */
		rack_log_to_prr(rack, 15, cwnd_enter, line);
		rack->r_ctl.dsack_byte_cnt = 0;
		rack->r_ctl.retran_during_recovery = 0;
		rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
		rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
		rack->r_ent_rec_ns = 1;
	}
}
5671
5672 static inline void
rack_cc_after_idle(struct tcp_rack * rack,struct tcpcb * tp)5673 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
5674 {
5675 uint32_t i_cwnd;
5676
5677 INP_WLOCK_ASSERT(tptoinpcb(tp));
5678
5679 if (CC_ALGO(tp)->after_idle != NULL)
5680 CC_ALGO(tp)->after_idle(&tp->t_ccv);
5681
5682 if (tp->snd_cwnd == 1)
5683 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
5684 else
5685 i_cwnd = rc_init_window(rack);
5686
5687 /*
5688 * Being idle is no different than the initial window. If the cc
5689 * clamps it down below the initial window raise it to the initial
5690 * window.
5691 */
5692 if (tp->snd_cwnd < i_cwnd) {
5693 tp->snd_cwnd = i_cwnd;
5694 }
5695 }
5696
5697 /*
5698 * Indicate whether this ack should be delayed. We can delay the ack if
5699 * following conditions are met:
5700 * - There is no delayed ack timer in progress.
5701 * - Our last ack wasn't a 0-sized window. We never want to delay
5702 * the ack that opens up a 0-sized window.
5703 * - LRO wasn't used for this segment. We make sure by checking that the
5704 * segment size is not larger than the MSS.
5705 * - Delayed acks are enabled or this is a half-synchronized T/TCP
5706 * connection.
5707 */
5708 #define DELAY_ACK(tp, tlen) \
5709 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
5710 ((tp->t_flags & TF_DELACK) == 0) && \
5711 (tlen <= tp->t_maxseg) && \
5712 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
5713
5714 static struct rack_sendmap *
rack_find_lowest_rsm(struct tcp_rack * rack)5715 rack_find_lowest_rsm(struct tcp_rack *rack)
5716 {
5717 struct rack_sendmap *rsm;
5718
5719 /*
5720 * Walk the time-order transmitted list looking for an rsm that is
5721 * not acked. This will be the one that was sent the longest time
5722 * ago that is still outstanding.
5723 */
5724 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
5725 if (rsm->r_flags & RACK_ACKED) {
5726 continue;
5727 }
5728 goto finish;
5729 }
5730 finish:
5731 return (rsm);
5732 }
5733
5734 static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack * rack,struct rack_sendmap * rsm)5735 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
5736 {
5737 struct rack_sendmap *prsm;
5738
5739 /*
5740 * Walk the sequence order list backward until we hit and arrive at
5741 * the highest seq not acked. In theory when this is called it
5742 * should be the last segment (which it was not).
5743 */
5744 prsm = rsm;
5745
5746 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) {
5747 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
5748 continue;
5749 }
5750 return (prsm);
5751 }
5752 return (NULL);
5753 }
5754
/*
 * Compute the RACK reordering/loss-detection threshold: srtt plus a
 * packet delay (the configured rc_pkt_delay, or srtt/4 in standards
 * based mode), inflated when reordering has been observed and
 * optionally by the DSACK count, then clamped to 2*srtt and to
 * rack_rto_max.
 */
static uint32_t
rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed)
{
	int32_t lro;
	uint32_t thresh;

	/*
	 * lro is the flag we use to determine if we have seen reordering.
	 * If it gets set we have seen reordering. The reorder logic either
	 * works in one of two ways:
	 *
	 * If reorder-fade is configured, then we track the last time we saw
	 * re-ordering occur. If we reach the point where enough time has
	 * passed we no longer consider reordering as occurring.
	 *
	 * Or if reorder-fade is 0, then once we see reordering we consider
	 * the connection to always be subject to reordering and just set lro
	 * to 1.
	 *
	 * In the end if lro is non-zero we add the extra time for
	 * reordering in.
	 */
	if (srtt == 0)
		srtt = 1;
	if (rack->r_ctl.rc_reorder_ts) {
		if (rack->r_ctl.rc_reorder_fade) {
			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
				lro = cts - rack->r_ctl.rc_reorder_ts;
				if (lro == 0) {
					/*
					 * No time has passed since the last
					 * reorder, mark it as reordering.
					 */
					lro = 1;
				}
			} else {
				/* Negative time? */
				lro = 0;
			}
			if (lro > rack->r_ctl.rc_reorder_fade) {
				/* Turn off reordering seen too */
				rack->r_ctl.rc_reorder_ts = 0;
				lro = 0;
			}
		} else {
			/* Reordering does not fade */
			lro = 1;
		}
	} else {
		lro = 0;
	}
	if (rack->rc_rack_tmr_std_based == 0) {
		thresh = srtt + rack->r_ctl.rc_pkt_delay;
	} else {
		/* Standards based pkt-delay is 1/4 srtt */
		thresh = srtt + (srtt >> 2);
	}
	if (lro && (rack->rc_rack_tmr_std_based == 0)) {
		/* It must be set, if not you get 1/4 rtt */
		if (rack->r_ctl.rc_reorder_shift)
			thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
		else
			thresh += (srtt >> 2);
	}
	if (rack->rc_rack_use_dsack &&
	    lro &&
	    (rack->r_ctl.num_dsack > 0)) {
		/*
		 * We only increase the reordering window if we
		 * have seen reordering <and> we have a DSACK count.
		 */
		thresh += rack->r_ctl.num_dsack * (srtt >> 2);
		if (log_allowed)
			rack_log_dsack_event(rack, 4, line, srtt, thresh);
	}
	/* SRTT * 2 is the ceiling */
	if (thresh > (srtt * 2)) {
		thresh = srtt * 2;
	}
	/* And we don't want it above the RTO max either */
	if (thresh > rack_rto_max) {
		thresh = rack_rto_max;
	}
	if (log_allowed)
		rack_log_dsack_event(rack, 6, line, srtt, thresh);
	return (thresh);
}
5842
static uint32_t
rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t srtt)
{
	/*
	 * Compute the Tail Loss Probe (TLP) timeout for the segment rsm.
	 *
	 * The base threshold is srtt + srtt/rc_tlp_threshold when the
	 * user supplied a divisor, otherwise 2*srtt.  Depending on the
	 * configured TLP mode (rack_tlp_threshold_use) we then compensate
	 * for the peer's delayed-ack timer and/or the inter-packet send
	 * gap, and finally clamp the result to [rack_tlp_min, min(t_rxtcur,
	 * rack_rto_max)].  All values are in microseconds.
	 */
	struct rack_sendmap *prsm;
	uint32_t thresh, len;
	int segsiz;

	/* Avoid a zero srtt; the divisions/shifts below need a base. */
	if (srtt == 0)
		srtt = 1;
	if (rack->r_ctl.rc_tlp_threshold)
		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
	else
		thresh = (srtt * 2);

	/* Get the previous sent packet, if any */
	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	len = rsm->r_end - rsm->r_start;
	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
		/* Exactly like the ID */
		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
			uint32_t alt_thresh;
			/*
			 * Compensate for delayed-ack with the d-ack time.
			 */
			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
			if (alt_thresh > thresh)
				thresh = alt_thresh;
		}
	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
		/* 2.1 behavior */
		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
		if (prsm && (len <= segsiz)) {
			/*
			 * Two packets outstanding, thresh should be (2*srtt) +
			 * possible inter-packet delay (if any).
			 */
			uint32_t inter_gap = 0;
			int idx, nidx;

			idx = rsm->r_rtr_cnt - 1;
			nidx = prsm->r_rtr_cnt - 1;
			/*
			 * NOTE(review): the guard compares
			 * rsm->r_tim_lastsent[nidx] against
			 * prsm->r_tim_lastsent[idx] but the gap is computed
			 * with the indices swapped ([idx] minus [nidx]).
			 * This matches the historical upstream code, but the
			 * cross-use of idx/nidx between the two rsm's looks
			 * intentional only if both retransmit counts are
			 * equal -- worth confirming against upstream intent.
			 */
			if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) {
				/* Yes it was sent later (or at the same time) */
				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
			}
			thresh += inter_gap;
		} else if (len <= segsiz) {
			/*
			 * Possibly compensate for delayed-ack.
			 */
			uint32_t alt_thresh;

			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
			if (alt_thresh > thresh)
				thresh = alt_thresh;
		}
	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
		/* 2.2 behavior */
		if (len <= segsiz) {
			uint32_t alt_thresh;
			/*
			 * Compensate for delayed-ack with the d-ack time.
			 */
			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
			if (alt_thresh > thresh)
				thresh = alt_thresh;
		}
	}
	/* Not above an RTO */
	if (thresh > tp->t_rxtcur) {
		thresh = tp->t_rxtcur;
	}
	/* Not above a RTO max */
	if (thresh > rack_rto_max) {
		thresh = rack_rto_max;
	}
	/* Apply user supplied min TLP */
	if (thresh < rack_tlp_min) {
		thresh = rack_tlp_min;
	}
	return (thresh);
}
5926
5927 static uint32_t
rack_grab_rtt(struct tcpcb * tp,struct tcp_rack * rack)5928 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
5929 {
5930 /*
5931 * We want the rack_rtt which is the
5932 * last rtt we measured. However if that
5933 * does not exist we fallback to the srtt (which
5934 * we probably will never do) and then as a last
5935 * resort we use RACK_INITIAL_RTO if no srtt is
5936 * yet set.
5937 */
5938 if (rack->rc_rack_rtt)
5939 return (rack->rc_rack_rtt);
5940 else if (tp->t_srtt == 0)
5941 return (RACK_INITIAL_RTO);
5942 return (tp->t_srtt);
5943 }
5944
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
{
	/*
	 * Check to see that we don't need to fall into recovery. We will
	 * need to do so if our oldest transmit is past the time we should
	 * have had an ack.
	 *
	 * Returns the overdue sendmap entry (after signalling congestion
	 * via rack_cong_signal()) or NULL when nothing is overdue yet.
	 */
	struct tcp_rack *rack;
	struct rack_sendmap *rsm;
	int32_t idx;
	uint32_t srtt, thresh;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (tqhash_empty(rack->r_ctl.tqh)) {
		/* Nothing outstanding at all. */
		return (NULL);
	}
	/* Oldest entry on the time-ordered transmit map. */
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
	if (rsm == NULL)
		return (NULL);


	if (rsm->r_flags & RACK_ACKED) {
		/* Head already acked; find the lowest un-acked entry. */
		rsm = rack_find_lowest_rsm(rack);
		if (rsm == NULL)
			return (NULL);
	}
	idx = rsm->r_rtr_cnt - 1;	/* index of the most recent (re)send */
	srtt = rack_grab_rtt(tp, rack);
	thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
	if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
		/* Sent "in the future" relative to tsused; not overdue. */
		return (NULL);
	}
	if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
		/* Not yet past the reordering threshold. */
		return (NULL);
	}
	/* Ok if we reach here we are over-due and this guy can be sent */
	rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
	return (rsm);
}
5985
5986 static uint32_t
rack_get_persists_timer_val(struct tcpcb * tp,struct tcp_rack * rack)5987 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
5988 {
5989 int32_t t;
5990 int32_t tt;
5991 uint32_t ret_val;
5992
5993 t = (tp->t_srtt + (tp->t_rttvar << 2));
5994 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
5995 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
5996 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
5997 ret_val = (uint32_t)tt;
5998 return (ret_val);
5999 }
6000
static uint32_t
rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
{
	/*
	 * Start the FR timer, we do this based on getting the first one in
	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
	 * events we need to stop the running timer (if its running) before
	 * starting the new one.
	 *
	 * Decides which timer should run next (RXT, RACK or TLP), sets the
	 * matching PACE_TMR_* flag in rc_hpts_flags, and returns the
	 * timeout in microseconds (0 means no timer).  sup_rack != 0
	 * suppresses RACK/TLP and forces the RXT path.
	 */
	uint32_t thresh, exp, to,  srtt, time_since_sent, tstmp_touse;
	uint32_t srtt_cur;
	int32_t idx;
	int32_t is_tlp_timer = 0;
	struct rack_sendmap *rsm;

	if (rack->t_timers_stopped) {
		/* All timers have been stopped none are to run */
		return (0);
	}
	if (rack->rc_in_persist) {
		/* We can't start any timer in persists */
		return (rack_get_persists_timer_val(tp, rack));
	}
	rack->rc_on_min_to = 0;
	if ((tp->t_state < TCPS_ESTABLISHED) ||
	    ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
		/* RACK/TLP need SACK and an established connection. */
		goto activate_rxt;
	}
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
	if ((rsm == NULL) || sup_rack) {
		/* Nothing on the send map or no rack */
activate_rxt:
		time_since_sent = 0;
		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
		if (rsm) {
			/*
			 * Should we discount the RTX timer any?
			 *
			 * We want to discount it the smallest amount.
			 * If a timer (Rack/TLP or RXT) has gone off more
			 * recently thats the discount we want to use (now - timer time).
			 * If the retransmit of the oldest packet was more recent then
			 * we want to use that (now - oldest-packet-last_transmit_time).
			 *
			 */
			idx = rsm->r_rtr_cnt - 1;
			if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
				tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
			else
				tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
			if (TSTMP_GT(cts, tstmp_touse))
			    time_since_sent = cts - tstmp_touse;
		}
		if (SEQ_LT(tp->snd_una, tp->snd_max) ||
		    sbavail(&tptosocket(tp)->so_snd)) {
			/* Data outstanding or queued: arm the RXT timer. */
			rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
			to = tp->t_rxtcur;
			if (to > time_since_sent)
				to -= time_since_sent;
			else
				to = rack->r_ctl.rc_min_to;
			if (to == 0)
				to = 1;
			/* Special case for KEEPINIT */
			if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
			    (TP_KEEPINIT(tp) != 0) &&
			    rsm) {
				/*
				 * We have to put a ceiling on the rxt timer
				 * of the keep-init timeout.
				 */
				uint32_t max_time, red;

				max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
				if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
					red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
					if (red < max_time)
						max_time -= red;
					else
						max_time = 1;
				}
				/* Reduce timeout to the keep value if needed */
				if (max_time < to)
					to = max_time;
			}
			return (to);
		}
		return (0);
	}
	if (rsm->r_flags & RACK_ACKED) {
		rsm = rack_find_lowest_rsm(rack);
		if (rsm == NULL) {
			/* No lowest? */
			goto activate_rxt;
		}
	}
	/*
	 * NOTE(review): the historical "Convert from ms to usecs" comment
	 * here looks stale; no unit conversion happens below.
	 */
	if ((rsm->r_flags & RACK_SACK_PASSED) ||
	    (rsm->r_flags & RACK_RWND_COLLAPSED) ||
	    (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
		if ((tp->t_flags & TF_SENTFIN) &&
		    ((tp->snd_max - tp->snd_una) == 1) &&
		    (rsm->r_flags & RACK_HAS_FIN)) {
			/*
			 * We don't start a rack timer if all we have is a
			 * FIN outstanding.
			 */
			goto activate_rxt;
		}
		if ((rack->use_rack_rr == 0) &&
		    (IN_FASTRECOVERY(tp->t_flags)) &&
		    (rack->rack_no_prr == 0) &&
		    (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
			/*
			 * We are not cheating, in recovery and
			 * not enough ack's to yet get our next
			 * retransmission out.
			 *
			 * Note that classified attackers do not
			 * get to use the rack-cheat.
			 */
			goto activate_tlp;
		}
		/* Arm the RACK timer relative to the last send + threshold. */
		srtt = rack_grab_rtt(tp, rack);
		thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1);
		idx = rsm->r_rtr_cnt - 1;
		exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
		if (SEQ_GEQ(exp, cts)) {
			to = exp - cts;
			if (to < rack->r_ctl.rc_min_to) {
				to = rack->r_ctl.rc_min_to;
				if (rack->r_rr_config == 3)
					rack->rc_on_min_to = 1;
			}
		} else {
			to = rack->r_ctl.rc_min_to;
			if (rack->r_rr_config == 3)
				rack->rc_on_min_to = 1;
		}
	} else {
		/* Ok we need to do a TLP not RACK */
activate_tlp:
		if ((rack->rc_tlp_in_progress != 0) &&
		    (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
			/*
			 * The previous send was a TLP and we have sent
			 * N TLP's without sending new data.
			 */
			goto activate_rxt;
		}
		/* TLP probes the newest (last sent) segment. */
		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
		if (rsm == NULL) {
			/* We found no rsm to TLP with. */
			goto activate_rxt;
		}
		if (rsm->r_flags & RACK_HAS_FIN) {
			/* If its a FIN we dont do TLP */
			rsm = NULL;
			goto activate_rxt;
		}
		idx = rsm->r_rtr_cnt - 1;
		time_since_sent = 0;
		if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
			tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
		else
			tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
		if (TSTMP_GT(cts, tstmp_touse))
		    time_since_sent = cts - tstmp_touse;
		is_tlp_timer = 1;
		if (tp->t_srtt) {
			if ((rack->rc_srtt_measure_made == 0) &&
			    (tp->t_srtt == 1)) {
				/*
				 * If another stack as run and set srtt to 1,
				 * then the srtt was 0, so lets use the initial.
				 */
				srtt = RACK_INITIAL_RTO;
			} else {
				srtt_cur = tp->t_srtt;
				srtt = srtt_cur;
			}
		} else
			srtt = RACK_INITIAL_RTO;
		/*
		 * If the SRTT is not keeping up and the
		 * rack RTT has spiked we want to use
		 * the last RTT not the smoothed one.
		 */
		if (rack_tlp_use_greater &&
		    tp->t_srtt &&
		    (srtt < rack_grab_rtt(tp, rack))) {
			srtt = rack_grab_rtt(tp, rack);
		}
		thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
		if (thresh > time_since_sent) {
			to = thresh - time_since_sent;
		} else {
			to = rack->r_ctl.rc_min_to;
			rack_log_alt_to_to_cancel(rack,
			    thresh,		/* flex1 */
			    time_since_sent,	/* flex2 */
			    tstmp_touse,	/* flex3 */
			    rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
			    (uint32_t)rsm->r_tim_lastsent[idx],
			    srtt,
			    idx, 99);
		}
		if (to < rack_tlp_min) {
			to = rack_tlp_min;
		}
		if (to > TICKS_2_USEC(tcp_rexmit_max)) {
			/*
			 * If the TLP time works out to larger than the max
			 * RTO lets not do TLP.. just RTO.
			 */
			goto activate_rxt;
		}
	}
	if (is_tlp_timer == 0) {
		rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
	} else {
		rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
	}
	/* Never return 0 when a timer is to be armed. */
	if (to == 0)
		to = 1;
	return (to);
}
6228
static void
rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
{
	/*
	 * Transition the connection into persist mode (zero window at the
	 * peer).  Idempotent: does nothing if already in persist.  Stops
	 * any goodput measurement in progress, suspends the long-term
	 * bandwidth accounting, cancels running timers and resets the
	 * retransmit backoff state.
	 */
	if (rack->rc_in_persist == 0) {
		if (tp->t_flags & TF_GPUTINPROG) {
			/*
			 * Stop the goodput now, the calling of the
			 * measurement function clears the flag.
			 */
			rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
						    RACK_QUALITY_PERSIST);
		}
#ifdef NETFLIX_SHARED_CWND
		if (rack->r_ctl.rc_scw) {
			tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
			rack->rack_scwnd_is_idle = 1;
		}
#endif
		rack->r_ctl.rc_went_idle_time = cts;
		/* 0 is a "not set" sentinel, so force a nonzero timestamp. */
		if (rack->r_ctl.rc_went_idle_time == 0)
			rack->r_ctl.rc_went_idle_time = 1;
		if (rack->lt_bw_up) {
			/* Suspend our LT BW measurement */
			uint64_t tmark;

			rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
			rack->r_ctl.lt_seq = snd_una;
			tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
			if (tmark >= rack->r_ctl.lt_timemark) {
				rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
			}
			rack->r_ctl.lt_timemark = tmark;
			rack->lt_bw_up = 0;
			/* Remember to resume LT BW when we exit persist. */
			rack->r_persist_lt_bw_off = 1;
		}
		rack_timer_cancel(tp, rack, cts, __LINE__);
		rack->r_ctl.persist_lost_ends = 0;
		rack->probe_not_answered = 0;
		rack->forced_ack = 0;
		/* Fresh backoff state for the persist probes. */
		tp->t_rxtshift = 0;
		RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
				   rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
		rack->rc_in_persist = 1;
	}
}
6274
static void
rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
	/*
	 * Leave persist mode: pull the connection off the hpts wheel,
	 * re-activate the shared cwnd (if compiled in), possibly credit
	 * the idle period as a completed probe-rtt, resume long-term
	 * bandwidth accounting, and reset the retransmit backoff state.
	 */
	if (tcp_in_hpts(rack->rc_tp)) {
		tcp_hpts_remove(rack->rc_tp);
		rack->r_ctl.rc_hpts_flags = 0;
	}
#ifdef NETFLIX_SHARED_CWND
	if (rack->r_ctl.rc_scw) {
		tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
		rack->rack_scwnd_is_idle = 0;
	}
#endif
	if (rack->rc_gp_dyn_mul &&
	    (rack->use_fixed_rate == 0) &&
	    (rack->rc_always_pace)) {
		/*
		 * Do we count this as if a probe-rtt just
		 * finished?
		 */
		uint32_t time_idle, idle_min;

		time_idle = cts - rack->r_ctl.rc_went_idle_time;
		idle_min = rack_min_probertt_hold;
		if (rack_probertt_gpsrtt_cnt_div) {
			/* Extend the minimum by a multiple of the gp srtt. */
			uint64_t extra;
			extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
				(uint64_t)rack_probertt_gpsrtt_cnt_mul;
			extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
			idle_min += (uint32_t)extra;
		}
		if (time_idle >= idle_min) {
			/* Yes, we count it as a probe-rtt. */
			uint32_t us_cts;

			us_cts = tcp_get_usecs(NULL);
			if (rack->in_probe_rtt == 0) {
				/* Record the idle period as a just-run probe-rtt. */
				rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
				rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
				rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
				rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
			} else {
				/* Already in probe-rtt; the idle satisfied it. */
				rack_exit_probertt(rack, us_cts);
			}
		}
	}
	if (rack->r_persist_lt_bw_off) {
		/* Continue where we left off */
		rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL);
		rack->lt_bw_up = 1;
		rack->r_persist_lt_bw_off = 0;
	}
	rack->rc_in_persist = 0;
	rack->r_ctl.rc_went_idle_time = 0;
	/* Fresh RTO state now that we can send again. */
	tp->t_rxtshift = 0;
	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
	   rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
	/* Clear any accumulated pacing early/late compensation. */
	rack->r_ctl.rc_agg_delayed = 0;
	rack->r_early = 0;
	rack->r_late = 0;
	rack->r_ctl.rc_agg_early = 0;
}
6337
6338 static void
rack_log_hpts_diag(struct tcp_rack * rack,uint32_t cts,struct hpts_diag * diag,struct timeval * tv)6339 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
6340 struct hpts_diag *diag, struct timeval *tv)
6341 {
6342 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6343 union tcp_log_stackspecific log;
6344
6345 memset(&log, 0, sizeof(log));
6346 log.u_bbr.flex1 = diag->p_nxt_slot;
6347 log.u_bbr.flex2 = diag->p_cur_slot;
6348 log.u_bbr.flex3 = diag->slot_req;
6349 log.u_bbr.flex4 = diag->inp_hptsslot;
6350 log.u_bbr.flex5 = diag->time_remaining;
6351 log.u_bbr.flex6 = diag->need_new_to;
6352 log.u_bbr.flex7 = diag->p_hpts_active;
6353 log.u_bbr.flex8 = diag->p_on_min_sleep;
6354 /* Hijack other fields as needed */
6355 log.u_bbr.epoch = diag->have_slept;
6356 log.u_bbr.lt_epoch = diag->yet_to_sleep;
6357 log.u_bbr.pkts_out = diag->co_ret;
6358 log.u_bbr.applimited = diag->hpts_sleep_time;
6359 log.u_bbr.delivered = diag->p_prev_slot;
6360 log.u_bbr.inflight = diag->p_runningslot;
6361 log.u_bbr.bw_inuse = diag->wheel_slot;
6362 log.u_bbr.rttProp = diag->wheel_cts;
6363 log.u_bbr.timeStamp = cts;
6364 log.u_bbr.delRate = diag->maxslots;
6365 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6366 &rack->rc_inp->inp_socket->so_rcv,
6367 &rack->rc_inp->inp_socket->so_snd,
6368 BBR_LOG_HPTSDIAG, 0,
6369 0, &log, false, tv);
6370 }
6371
6372 }
6373
6374 static void
rack_log_wakeup(struct tcpcb * tp,struct tcp_rack * rack,struct sockbuf * sb,uint32_t len,int type)6375 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
6376 {
6377 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6378 union tcp_log_stackspecific log;
6379 struct timeval tv;
6380
6381 memset(&log, 0, sizeof(log));
6382 log.u_bbr.flex1 = sb->sb_flags;
6383 log.u_bbr.flex2 = len;
6384 log.u_bbr.flex3 = sb->sb_state;
6385 log.u_bbr.flex8 = type;
6386 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
6387 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6388 &rack->rc_inp->inp_socket->so_rcv,
6389 &rack->rc_inp->inp_socket->so_snd,
6390 TCP_LOG_SB_WAKE, 0,
6391 len, &log, false, &tv);
6392 }
6393 }
6394
static void
rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
      int32_t usecs, uint32_t tot_len_this_send, int sup_rack)
{
	/*
	 * Arm the hpts pacer and/or a protocol timer for this connection.
	 *
	 * usecs is the requested pacing delay (0 = none); tot_len_this_send
	 * is only used for logging; sup_rack is passed through to
	 * rack_timer_start() to suppress RACK/TLP selection.  The function
	 * reconciles the pacing delay with early/late carry-over
	 * accounting, picks the protocol timeout via rack_timer_start()
	 * (falling back to keep-alive when nothing else is pending), sets
	 * the LRO queuing flags in t_flags2, and finally inserts the
	 * connection into hpts.
	 */
	struct hpts_diag diag;
	struct inpcb *inp = tptoinpcb(tp);
	struct timeval tv;
	uint32_t delayed_ack = 0;
	uint32_t hpts_timeout;
	uint32_t entry_usecs = usecs;
	uint8_t stopped;
	uint32_t left = 0;
	uint32_t us_cts;

	if ((tp->t_state == TCPS_CLOSED) ||
	    (tp->t_state == TCPS_LISTEN)) {
		/* Nothing to time in these states. */
		return;
	}
	if (tcp_in_hpts(tp)) {
		/* Already on the pacer */
		return;
	}
	stopped = rack->rc_tmr_stopped;
	if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
		/* Remember how much of a stopped timer was left to run. */
		left = rack->r_ctl.rc_timer_exp - cts;
	}
	rack->r_ctl.rc_timer_exp = 0;
	rack->r_ctl.rc_hpts_flags = 0;
	us_cts = tcp_get_usecs(&tv);
	/* Now early/late accounting */
	rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0);
	if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
		/*
		 * We have a early carry over set,
		 * we can always add more time so we
		 * can always make this compensation.
		 *
		 * Note if ack's are allowed to wake us do not
		 * penalize the next timer for being awoke
		 * by an ack aka the rc_agg_early (non-paced mode).
		 */
		usecs += rack->r_ctl.rc_agg_early;
		rack->r_early = 0;
		rack->r_ctl.rc_agg_early = 0;
	}
	if ((rack->r_late) &&
	    ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) {
		/*
		 * This is harder, we can
		 * compensate some but it
		 * really depends on what
		 * the current pacing time is.
		 */
		if (rack->r_ctl.rc_agg_delayed >= usecs) {
			/*
			 * We can't compensate for it all.
			 * And we have to have some time
			 * on the clock. We always have a min
			 * 10 HPTS timer units (10 x 10 i.e. 100 usecs).
			 */
			if (usecs <= HPTS_USECS_PER_SLOT) {
				/* We gain delay */
				rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs);
				usecs = HPTS_USECS_PER_SLOT;
			} else {
				/* We take off some */
				rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT);
				usecs = HPTS_USECS_PER_SLOT;
			}
		} else {
			usecs -= rack->r_ctl.rc_agg_delayed;
			rack->r_ctl.rc_agg_delayed = 0;
			/* Make sure we have 100 useconds at minimum */
			if (usecs < HPTS_USECS_PER_SLOT) {
				rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs;
				usecs = HPTS_USECS_PER_SLOT;
			}
			if (rack->r_ctl.rc_agg_delayed == 0)
				rack->r_late = 0;
		}
	} else if (rack->r_late) {
		/* r_use_hpts_min is on and so is DGP */
		uint32_t max_red;

		/* Cap the reduction to max_reduction percent of usecs. */
		max_red = (usecs * rack->r_ctl.max_reduction) / 100;
		if (max_red >= rack->r_ctl.rc_agg_delayed) {
			usecs -= rack->r_ctl.rc_agg_delayed;
			rack->r_ctl.rc_agg_delayed = 0;
		} else {
			usecs -= max_red;
			rack->r_ctl.rc_agg_delayed -= max_red;
		}
	}
	if ((rack->r_use_hpts_min == 1) &&
	    (usecs > 0) &&
	    (rack->dgp_on == 1)) {
		/*
		 * We are enforcing a min pacing timer
		 * based on our hpts min timeout.
		 */
		uint32_t min;

		min = get_hpts_min_sleep_time();
		if (min > usecs) {
			usecs = min;
		}
	}
	hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
	if (tp->t_flags & TF_DELACK) {
		delayed_ack = TICKS_2_USEC(tcp_delacktime);
		rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
	}
	if (delayed_ack && ((hpts_timeout == 0) ||
			    (delayed_ack < hpts_timeout)))
		hpts_timeout = delayed_ack;
	else
		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
	/*
	 * If no timers are going to run and we will fall off the hptsi
	 * wheel, we resort to a keep-alive timer if its configured.
	 */
	if ((hpts_timeout == 0) &&
	    (usecs == 0)) {
		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
		    (tp->t_state <= TCPS_CLOSING)) {
			/*
			 * Ok we have no timer (persists, rack, tlp, rxt or
			 * del-ack), we don't have segments being paced. So
			 * all that is left is the keepalive timer.
			 */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				/* Get the established keep-alive time */
				hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
			} else {
				/*
				 * Get the initial setup keep-alive time,
				 * note that this is probably not going to
				 * happen, since rack will be running a rxt timer
				 * if a SYN of some sort is outstanding. It is
				 * actually handled in rack_timeout_rxt().
				 */
				hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
			}
			rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
			if (rack->in_probe_rtt) {
				/*
				 * We want to instead not wake up a long time from
				 * now but to wake up about the time we would
				 * exit probe-rtt and initiate a keep-alive ack.
				 * This will get us out of probe-rtt and update
				 * our min-rtt.
				 */
				hpts_timeout = rack_min_probertt_hold;
			}
		}
	}
	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
	    (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
		/*
		 * RACK, TLP, persists and RXT timers all are restartable
		 * based on actions input .. i.e we received a packet (ack
		 * or sack) and that changes things (rw, or snd_una etc).
		 * Thus we can restart them with a new value. For
		 * keep-alive, delayed_ack we keep track of what was left
		 * and restart the timer with a smaller value.
		 */
		if (left < hpts_timeout)
			hpts_timeout = left;
	}
	if (hpts_timeout) {
		/*
		 * Hack alert for now we can't time-out over 2,147,483
		 * seconds (a bit more than 596 hours), which is probably ok
		 * :).
		 */
		if (hpts_timeout > 0x7ffffffe)
			hpts_timeout = 0x7ffffffe;
		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
	}
	rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
	if ((rack->gp_ready == 0) &&
	    (rack->use_fixed_rate == 0) &&
	    (hpts_timeout < usecs) &&
	    (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
		/*
		 * We have no good estimate yet for the
		 * old clunky burst mitigation or the
		 * real pacing. And the tlp or rxt is smaller
		 * than the pacing calculation. Lets not
		 * pace that long since we know the calculation
		 * so far is not accurate.
		 */
		usecs = hpts_timeout;
	}
	/**
	 * Turn off all the flags for queuing by default. The
	 * flags have important meanings to what happens when
	 * LRO interacts with the transport. Most likely (by default now)
	 * mbuf_queueing and ack compression are on. So the transport
	 * has a couple of flags that control what happens (if those
	 * are not on then these flags won't have any effect since it
	 * won't go through the queuing LRO path).
	 *
	 * TF2_MBUF_QUEUE_READY - This flags says that I am busy
	 *                        pacing output, so don't disturb. But
	 *                        it also means LRO can wake me if there
	 *                        is a SACK arrival.
	 *
	 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction
	 *                       with the above flag (QUEUE_READY) and
	 *                       when present it says don't even wake me
	 *                       if a SACK arrives.
	 *
	 * The idea behind these flags is that if we are pacing we
	 * set the MBUF_QUEUE_READY and only get woken up if
	 * a SACK arrives (which could change things) or if
	 * our pacing timer expires. If, however, we have a rack
	 * timer running, then we don't even want a sack to wake
	 * us since the rack timer has to expire before we can send.
	 *
	 * Other cases should usually have none of the flags set
	 * so LRO can call into us.
	 */
	tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY);
	if (usecs) {
		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
		rack->r_ctl.rc_last_output_to = us_cts + usecs;
		/*
		 * A pacing timer (usecs microseconds) is being set, in
		 * such a case we cannot send (we are blocked by
		 * the timer). So lets tell LRO that it should not
		 * wake us unless there is a SACK. Note this only
		 * will be effective if mbuf queueing is on or
		 * compressed acks are being processed.
		 */
		tp->t_flags2 |= TF2_MBUF_QUEUE_READY;
		/*
		 * But wait if we have a Rack timer running
		 * even a SACK should not disturb us (with
		 * the exception of r_rr_config 3).
		 */
		if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) ||
		    (IN_RECOVERY(tp->t_flags))) {
			if (rack->r_rr_config != 3)
				tp->t_flags2 |= TF2_DONT_SACK_QUEUE;
			else if (rack->rc_pace_dnd) {
				/*
				 * When DND is on, we only let a sack
				 * interrupt us if we are not in recovery.
				 *
				 * If DND is off, then we never hit here
				 * and let all sacks wake us up.
				 *
				 */
				tp->t_flags2 |= TF2_DONT_SACK_QUEUE;
			}
		}
		if (rack->rc_ack_can_sendout_data) {
			/*
			 * Ahh but wait, this is that special case
			 * where the pacing timer can be disturbed
			 * backout the changes (used for non-paced
			 * burst limiting).
			 */
			tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE |
			    TF2_MBUF_QUEUE_READY);
		}
		if ((rack->use_rack_rr) &&
		    (rack->r_rr_config < 2) &&
		    ((hpts_timeout) && (hpts_timeout < usecs))) {
			/*
			 * Arrange for the hpts to kick back in after the
			 * t-o if the t-o does not cause a send.
			 */
			tcp_hpts_insert(tp, hpts_timeout, &diag);
			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
			rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
		} else {
			/* Normal case: sleep for the pacing interval. */
			tcp_hpts_insert(tp, usecs, &diag);
			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
			rack_log_to_start(rack, cts, hpts_timeout, usecs, 1);
		}
	} else if (hpts_timeout) {
		/*
		 * With respect to t_flags2(?) here, lets let any new acks wake
		 * us up here. Since we are not pacing (no pacing timer), output
		 * can happen so we should let it. If its a Rack timer, then any inbound
		 * packet probably won't change the sending (we will be blocked)
		 * but it may change the prr stats so letting it in (the set defaults
		 * at the start of this block) are good enough.
		 */
		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
		tcp_hpts_insert(tp, hpts_timeout, &diag);
		rack_log_hpts_diag(rack, us_cts, &diag, &tv);
		rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
	} else {
		/* No timer starting */
#ifdef INVARIANTS
		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
			panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?",
			    tp, rack, tot_len_this_send, cts, usecs, hpts_timeout);
		}
#endif
	}
	rack->rc_tmr_stopped = 0;
	if (usecs)
		rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__);
}
6703
static void
rack_mark_lost(struct tcpcb *tp,
    struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
{
	/*
	 * Walk the time-ordered transmit map starting at rsm and flag as
	 * RACK_WAS_LOST every entry that is marked SACK_PASSED and whose
	 * reordering window (thresh) has expired by cts, adding each
	 * entry's byte count to rc_considered_lost.  The walk ends at the
	 * first entry not marked SACK_PASSED or the first one not yet
	 * past the threshold.
	 */
	struct rack_sendmap *nrsm;
	uint32_t thresh,  exp;

	thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
	nrsm = rsm;
	TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) {
		if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) {
			/* Got up to all that were marked sack-passed */
			break;
		}
		if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
			/*
			 * NOTE(review): expiration is computed from the
			 * starting rsm's last-send time, not nrsm's.  Since
			 * rc_tmap is walked forward from rsm this applies
			 * the oldest entry's timestamp to every entry in
			 * the run -- confirm against upstream intent before
			 * changing.
			 */
			exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
			if (TSTMP_LT(exp, cts) || (exp == cts)) {
				/* We now consider it lost */
				nrsm->r_flags |= RACK_WAS_LOST;
				rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
			} else {
				/* Past here it won't be lost so stop */
				break;
			}
		}
	}
}
6731
6732 static inline void
rack_mark_nolonger_lost(struct tcp_rack * rack,struct rack_sendmap * rsm)6733 rack_mark_nolonger_lost(struct tcp_rack *rack, struct rack_sendmap *rsm)
6734 {
6735 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
6736 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
6737 rsm->r_flags &= ~RACK_WAS_LOST;
6738 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
6739 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
6740 else
6741 rack->r_ctl.rc_considered_lost = 0;
6742 }
6743
6744 /*
6745 * RACK Timer, here we simply do logging and house keeping.
6746 * the normal rack_output() function will call the
6747 * appropriate thing to check if we need to do a RACK retransmit.
6748 * We return 1, saying don't proceed with rack_output only
6749 * when all timers have been stopped (destroyed PCB?).
6750 */
static int
rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
	/*
	 * This timer simply provides an internal trigger to send out data.
	 * The check_recovery_mode call will see if there are needed
	 * retransmissions, if so we will enter fast-recovery. The output
	 * call may or may not do the same thing depending on sysctl
	 * settings.
	 *
	 * Returns 1 when no retransmission is pending (a fresh timer is
	 * restarted and rack_output() should not proceed), 0 otherwise.
	 */
	struct rack_sendmap *rsm;

	counter_u64_add(rack_to_tot, 1);
	if (rack->r_state && (rack->r_state != tp->t_state))
		rack_set_state(tp, rack);
	rack->rc_on_min_to = 0;
	rsm = rack_check_recovery_mode(tp, cts);
	rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
	if (rsm) {
		/* We need to stroke any lost that are now declared as lost */
		rack_mark_lost(tp, rack, rsm, cts);
		/* Queue rsm for retransmission and force output. */
		rack->r_ctl.rc_resend = rsm;
		rack->r_timer_override = 1;
		if (rack->use_rack_rr) {
			/*
			 * Don't accumulate extra pacing delay
			 * we are allowing the rack timer to
			 * over-ride pacing i.e. rrr takes precedence
			 * if the pacing interval is longer than the rrr
			 * time (in other words we get the min pacing
			 * time versus rrr pacing time).
			 */
			rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
		}
	}
	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
	if (rsm == NULL) {
		/* restart a timer and return 1 */
		rack_start_hpts_timer(rack, tp, cts,
				      0, 0, 0);
		return (1);
	}
	return (0);
}
6795
6796
6797
static void
rack_adjust_orig_mlen(struct rack_sendmap *rsm)
{
	/*
	 * Re-sync the rsm's cached mbuf bookkeeping (orig_m_len,
	 * orig_t_space, soff) with the actual state of rsm->m, which may
	 * have changed since the send: the mbuf can grow at the tail
	 * (trailing space shrinks) or be trimmed at the head by an
	 * arriving ack.
	 */

	if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) {
		/*
		 * The trailing space changed, mbufs can grow
		 * at the tail but they can't shrink from
		 * it, KASSERT that. Adjust the orig_m_len to
		 * compensate for this change.
		 */
		KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)),
			("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
			 rsm->m,
			 rsm,
			 (intmax_t)M_TRAILINGROOM(rsm->m),
			 rsm->orig_t_space,
			 rsm->orig_m_len,
			 rsm->m->m_len));
		rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m));
		rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
	}
	if (rsm->m->m_len < rsm->orig_m_len) {
		/*
		 * Mbuf shrank, trimmed off the top by an ack, our
		 * offset changes.
		 */
		KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)),
			("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n",
			 rsm->m, rsm->m->m_len,
			 rsm, rsm->orig_m_len,
			 rsm->soff));
		if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len))
			rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
		else
			rsm->soff = 0;
		rsm->orig_m_len = rsm->m->m_len;
#ifdef INVARIANTS
	} else if (rsm->m->m_len > rsm->orig_m_len) {
		/* Growth without trailing-space change should be impossible. */
		panic("rsm:%p m:%p m_len grew outside of t_space compensation",
		      rsm, rsm->m);
#endif
	}
}
6842
static void
rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
{
	/*
	 * Point rsm (the right-hand piece of a split) at the mbuf and
	 * offset where its data begins, derived from src_rsm's position
	 * plus src_rsm's (post-split) length. Walks forward through the
	 * mbuf chain as needed.
	 */
	struct mbuf *m;
	uint32_t soff;

	if (src_rsm->m &&
	    ((src_rsm->orig_m_len != src_rsm->m->m_len) ||
	     (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) {
		/* Fix up the orig_m_len and possibly the mbuf offset */
		rack_adjust_orig_mlen(src_rsm);
	}
	m = src_rsm->m;
	/* Offset of rsm's first byte, relative to src_rsm's mbuf */
	soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
	while (soff >= m->m_len) {
		/* Move out past this mbuf */
		soff -= m->m_len;
		m = m->m_next;
		KASSERT((m != NULL),
			("rsm:%p nrsm:%p hit at soff:%u null m",
			 src_rsm, rsm, soff));
		if (m == NULL) {
			/* This should *not* happen which is why there is a kassert */
			/*
			 * Recovery path: re-derive both rsm's positions
			 * directly from the socket send buffer and snd_una.
			 */
			src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
					       (src_rsm->r_start - rack->rc_tp->snd_una),
					       &src_rsm->soff);
			src_rsm->orig_m_len = src_rsm->m->m_len;
			src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m);
			rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
					   (rsm->r_start - rack->rc_tp->snd_una),
					   &rsm->soff);
			rsm->orig_m_len = rsm->m->m_len;
			rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
			return;
		}
	}
	/* Normal path: record the mbuf/offset and snapshot its geometry */
	rsm->m = m;
	rsm->soff = soff;
	rsm->orig_m_len = m->m_len;
	rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
}
6884
static inline void
rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
    struct rack_sendmap *rsm, uint32_t start)
{
	/*
	 * Split rsm at sequence 'start': nrsm becomes the right-hand
	 * piece [start, old r_end) and rsm is shrunk to end at 'start'.
	 * nrsm inherits rsm's retransmit history and flags, with the
	 * edge-sensitive flags (SYN left, FIN/PUSH right) adjusted.
	 */
	int idx;

	nrsm->r_start = start;
	nrsm->r_end = rsm->r_end;
	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
	nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt;
	nrsm->r_flags = rsm->r_flags;
	nrsm->r_dupack = rsm->r_dupack;
	nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
	/* Retransmitted-bytes accounting stays with the left piece */
	nrsm->r_rtr_bytes = 0;
	nrsm->r_fas = rsm->r_fas;
	nrsm->r_bas = rsm->r_bas;
	/* Shrink the left piece so it now ends where nrsm begins */
	tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start);
	nrsm->r_just_ret = rsm->r_just_ret;
	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
	}
	/* Now if we have SYN flag we keep it on the left edge */
	if (nrsm->r_flags & RACK_HAS_SYN)
		nrsm->r_flags &= ~RACK_HAS_SYN;
	/* Now if we have a FIN flag we keep it on the right edge */
	if (rsm->r_flags & RACK_HAS_FIN)
		rsm->r_flags &= ~RACK_HAS_FIN;
	/* Push bit must go to the right edge as well */
	if (rsm->r_flags & RACK_HAD_PUSH)
		rsm->r_flags &= ~RACK_HAD_PUSH;
	/* Update the count if app limited */
	if (nrsm->r_flags & RACK_APP_LIMITED)
		rack->r_ctl.rc_app_limited_cnt++;
	/* Clone over the state of the hw_tls flag */
	nrsm->r_hw_tls = rsm->r_hw_tls;
	/*
	 * Now we need to find nrsm's new location in the mbuf chain
	 * we basically calculate a new offset, which is soff +
	 * how much is left in original rsm. Then we walk out the mbuf
	 * chain to find the right position, it may be the same mbuf
	 * or maybe not.
	 */
	KASSERT(((rsm->m != NULL) ||
		 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
		("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
	if (rsm->m)
		rack_setup_offset_for_rsm(rack, rsm, nrsm);
}
6933
static struct rack_sendmap *
rack_merge_rsm(struct tcp_rack *rack,
	       struct rack_sendmap *l_rsm,
	       struct rack_sendmap *r_rsm)
{
	/*
	 * We are merging two ack'd RSM's,
	 * the l_rsm is on the left (lower seq
	 * values) and the r_rsm is on the right
	 * (higher seq value). The simplest way
	 * to merge these is to move the right
	 * one into the left. I don't think there
	 * is any reason we need to try to find
	 * the oldest (or last oldest retransmitted).
	 *
	 * r_rsm is freed here; the merged l_rsm is returned.
	 */
	rack_log_map_chg(rack->rc_tp, rack, NULL,
			 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
	/* Extend the left piece to cover the right piece's range */
	tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end);
	/* Keep the larger dupack count */
	if (l_rsm->r_dupack < r_rsm->r_dupack)
		l_rsm->r_dupack = r_rsm->r_dupack;
	if (r_rsm->r_rtr_bytes)
		l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
	if (r_rsm->r_in_tmap) {
		/* This really should not happen */
		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
		r_rsm->r_in_tmap = 0;
	}

	/* Now the flags */
	if (r_rsm->r_flags & RACK_HAS_FIN)
		l_rsm->r_flags |= RACK_HAS_FIN;
	if (r_rsm->r_flags & RACK_TLP)
		l_rsm->r_flags |= RACK_TLP;
	if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
		l_rsm->r_flags |= RACK_RWND_COLLAPSED;
	if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
	    ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
		/*
		 * If both are app-limited then let the
		 * free lower the count. If right is app
		 * limited and left is not, transfer.
		 */
		l_rsm->r_flags |= RACK_APP_LIMITED;
		r_rsm->r_flags &= ~RACK_APP_LIMITED;
		if (r_rsm == rack->r_ctl.rc_first_appl)
			rack->r_ctl.rc_first_appl = l_rsm;
	}
	tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE);
	/*
	 * We keep the largest value, which is the newest
	 * send. We do this in case a segment that is
	 * joined together and not part of a GP estimate
	 * later gets expanded into the GP estimate.
	 *
	 * We prohibit the merging of unlike kinds i.e.
	 * all pieces that are in the GP estimate can be
	 * merged and all pieces that are not in a GP estimate
	 * can be merged, but not dissimilar pieces. Combine
	 * this with taking the highest here and we should
	 * be ok unless of course the client reneges. Then
	 * all bets are off.
	 */
	if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] <
	   r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) {
		l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)];
	}
	/*
	 * When merging two RSM's we also need to consider the ack time and keep
	 * newest. If the ack gets merged into a measurement then that is the
	 * one we will want to be using.
	 */
	if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival)
		l_rsm->r_ack_arrival = r_rsm->r_ack_arrival;

	if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
		/* Transfer the split limit to the map we free */
		r_rsm->r_limit_type = l_rsm->r_limit_type;
		l_rsm->r_limit_type = 0;
	}
	rack_free(rack, r_rsm);
	l_rsm->r_flags |= RACK_MERGED;
	return (l_rsm);
}
7017
7018 /*
7019 * TLP Timer, here we simply setup what segment we want to
7020 * have the TLP expire on, the normal rack_output() will then
7021 * send it out.
7022 *
7023 * We return 1, saying don't proceed with rack_output only
7024 * when all timers have been stopped (destroyed PCB?).
7025 */
static int
rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
{
	/*
	 * Tail Loss Probe.
	 *
	 * Decide what the TLP should carry: either a new MSS of data (if
	 * available and the window permits) via rc_tlp_new_data, or a
	 * retransmission of the newest (or oldest, per sysctl) un-acked
	 * segment via rc_tlpsend. Sets *doing_tlp and r_timer_override so
	 * the output path sends it. Returns 0 to proceed, or a negative
	 * errno when the connection should be dropped.
	 */
	struct rack_sendmap *rsm = NULL;
	int insret __diagused;
	struct socket *so = tptosocket(tp);
	uint32_t amm;
	uint32_t out, avail;
	int collapsed_win = 0;

	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
		/* Its not time yet */
		return (0);
	}
	if (ctf_progress_timeout_check(tp, true)) {
		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
		return (-ETIMEDOUT); /* tcp_drop() */
	}
	/*
	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
	 * need to figure out how to force a full MSS segment out.
	 */
	rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
	rack->r_ctl.retran_during_recovery = 0;
	rack->r_might_revert = 0;
	rack->r_ctl.dsack_byte_cnt = 0;
	counter_u64_add(rack_tlp_tot, 1);
	if (rack->r_state && (rack->r_state != tp->t_state))
		rack_set_state(tp, rack);
	avail = sbavail(&so->so_snd);
	out = tp->snd_max - tp->snd_una;
	if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
		/* special case, we need a retransmission */
		collapsed_win = 1;
		goto need_retran;
	}
	/* Count down the DSACK persistence (a TLP went out since it was set) */
	if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
		rack->r_ctl.dsack_persist--;
		if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
			rack->r_ctl.num_dsack = 0;
		}
		rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
	}
	if ((tp->t_flags & TF_GPUTINPROG) &&
	    (rack->r_ctl.rc_tlp_cnt_out == 1)) {
		/*
		 * If this is the second in a row
		 * TLP and we are doing a measurement
		 * its time to abandon the measurement.
		 * Something is likely broken on
		 * the clients network and measuring a
		 * broken network does us no good.
		 */
		tp->t_flags &= ~TF_GPUTINPROG;
		rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
					   rack->r_ctl.rc_gp_srtt /*flex1*/,
					   tp->gput_seq,
					   0, 0, 18, __LINE__, NULL, 0);
	}
	/*
	 * Check our send oldest always settings, and if
	 * there is an oldest to send jump to the need_retran.
	 */
	if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
		goto need_retran;

	if (avail > out) {
		/* New data is available */
		amm = avail - out;
		if (amm > ctf_fixed_maxseg(tp)) {
			/* Cap the probe to one MSS */
			amm = ctf_fixed_maxseg(tp);
			if ((amm + out) > tp->snd_wnd) {
				/* We are rwnd limited */
				goto need_retran;
			}
		} else if (amm < ctf_fixed_maxseg(tp)) {
			/* not enough to fill a MTU */
			goto need_retran;
		}
		if (IN_FASTRECOVERY(tp->t_flags)) {
			/* Unlikely */
			if (rack->rack_no_prr == 0) {
				if (out + amm <= tp->snd_wnd) {
					/* Grant PRR credit so the new data can go */
					rack->r_ctl.rc_prr_sndcnt = amm;
					rack->r_ctl.rc_tlp_new_data = amm;
					rack_log_to_prr(rack, 4, 0, __LINE__);
				}
			} else
				goto need_retran;
		} else {
			/* Set the send-new override */
			if (out + amm <= tp->snd_wnd)
				rack->r_ctl.rc_tlp_new_data = amm;
			else
				goto need_retran;
		}
		rack->r_ctl.rc_tlpsend = NULL;
		counter_u64_add(rack_tlp_newdata, 1);
		goto send;
	}
	/* No suitable new data: fall through and retransmit instead */
need_retran:
	/*
	 * Ok we need to arrange the last un-acked segment to be re-sent, or
	 * optionally the first un-acked segment.
	 */
	if (collapsed_win == 0) {
		if (rack_always_send_oldest)
			rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
		else {
			rsm = tqhash_max(rack->r_ctl.tqh);
			if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
				rsm = rack_find_high_nonack(rack, rsm);
			}
		}
		if (rsm == NULL) {
#ifdef TCP_BLACKBOX
			tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
#endif
			goto out;
		}
	} else {
		/*
		 * We had a collapsed window, lets find
		 * the point before the collapse.
		 */
		if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una))
			rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1));
		else {
			rsm = tqhash_min(rack->r_ctl.tqh);
		}
		if (rsm == NULL) {
			/* Huh */
			goto out;
		}
	}
	if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
		/*
		 * We need to split this the last segment in two.
		 */
		struct rack_sendmap *nrsm;

		nrsm = rack_alloc_full_limit(rack);
		if (nrsm == NULL) {
			/*
			 * No memory to split, we will just exit and punt
			 * off to the RXT timer.
			 */
			goto out;
		}
		/* nrsm becomes the final MSS of rsm; that is what we probe with */
		rack_clone_rsm(rack, nrsm, rsm,
			       (rsm->r_end - ctf_fixed_maxseg(tp)));
		rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
#ifndef INVARIANTS
		(void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
		if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
			panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
			      nrsm, insret, rack, rsm);
		}
#endif
		if (rsm->r_in_tmap) {
			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
			nrsm->r_in_tmap = 1;
		}
		rsm = nrsm;
	}
	rack->r_ctl.rc_tlpsend = rsm;
send:
	/* Make sure output path knows we are doing a TLP */
	*doing_tlp = 1;
	rack->r_timer_override = 1;
	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
	return (0);
out:
	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
	return (0);
}
7206
7207 /*
7208 * Delayed ack Timer, here we simply need to setup the
7209 * ACK_NOW flag and remove the DELACK flag. From there
7210 * the output routine will send the ack out.
7211 *
7212 * We only return 1, saying don't proceed, if all timers
7213 * are stopped (destroyed PCB?).
7214 */
7215 static int
rack_timeout_delack(struct tcpcb * tp,struct tcp_rack * rack,uint32_t cts)7216 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7217 {
7218
7219 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
7220 tp->t_flags &= ~TF_DELACK;
7221 tp->t_flags |= TF_ACKNOW;
7222 KMOD_TCPSTAT_INC(tcps_delack);
7223 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
7224 return (0);
7225 }
7226
7227 static inline int
rack_send_ack_challange(struct tcp_rack * rack)7228 rack_send_ack_challange(struct tcp_rack *rack)
7229 {
7230 struct tcptemp *t_template;
7231
7232 t_template = tcpip_maketemplate(rack->rc_inp);
7233 if (t_template) {
7234 if (rack->forced_ack == 0) {
7235 rack->forced_ack = 1;
7236 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
7237 } else {
7238 rack->probe_not_answered = 1;
7239 }
7240 tcp_respond(rack->rc_tp, t_template->tt_ipgen,
7241 &t_template->tt_t, (struct mbuf *)NULL,
7242 rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0);
7243 free(t_template, M_TEMP);
7244 /* This does send an ack so kill any D-ack timer */
7245 if (rack->rc_tp->t_flags & TF_DELACK)
7246 rack->rc_tp->t_flags &= ~TF_DELACK;
7247 return(1);
7248 } else
7249 return (0);
7250
7251 }
7252
7253 /*
7254 * Persists timer, here we simply send the
7255 * same thing as a keepalive will.
7256 * the one byte send.
7257 *
7258 * We only return 1, saying don't proceed, if all timers
7259 * are stopped (destroyed PCB?).
7260 */
static int
rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
	/*
	 * Persist timer fired: send a window probe (one forced ack via
	 * rack_send_ack_challange), or drop the connection if the peer
	 * has been unreachable past the backoff/idle limits. Returns 0
	 * normally, a negative errno to request tcp_drop().
	 */
	int32_t retval = 1;

	if (rack->rc_in_persist == 0)
		return (0);
	if (ctf_progress_timeout_check(tp, false)) {
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
		return (-ETIMEDOUT); /* tcp_drop() */
	}
	/*
	 * Persistence timer into zero window. Force a byte to be output, if
	 * possible.
	 */
	KMOD_TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not time out if the
	 * window is closed. After a full backoff, drop the connection if
	 * the idle time (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift >= V_tcp_retries &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
		KMOD_TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
		retval = -ETIMEDOUT; /* tcp_drop() */
		goto out;
	}
	/* Everything acked and nothing queued: no reason to stay in persist */
	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
	    tp->snd_una == tp->snd_max)
		rack_exit_persist(tp, rack, cts);
	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		KMOD_TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
		retval = -ETIMEDOUT; /* tcp_drop() */
		goto out;
	}
	if (rack_send_ack_challange(rack)) {
		/* only set it if we were answered */
		if (rack->probe_not_answered) {
			counter_u64_add(rack_persists_loss, 1);
			rack->r_ctl.persist_lost_ends++;
		}
		counter_u64_add(rack_persists_sends, 1);
		counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
	}
	/* Back off the next probe interval */
	if (tp->t_rxtshift < V_tcp_retries)
		tp->t_rxtshift++;
out:
	rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
	rack_start_hpts_timer(rack, tp, cts,
	    0, 0, 0);
	return (retval);
}
7327
7328 /*
7329 * If a keepalive goes off, we had no other timers
7330 * happening. We always return 1 here since this
7331 * routine either drops the connection or sends
7332 * out a segment with respond.
7333 */
static int
rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
{
	/*
	 * Keep-alive timer fired: either probe the peer (via
	 * rack_send_ack_challange) and restart the timer, or drop the
	 * connection if it has been idle too long / never established.
	 * Returns 1 on the probe path, negative errno to drop.
	 */
	struct inpcb *inp = tptoinpcb(tp);

	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
	rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
	/*
	 * Keep-alive timer went off; send something or drop connection if
	 * idle for too long.
	 */
	KMOD_TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response if the peer is
		 * up and reachable: either an ACK if the connection is
		 * still alive, or an RST if the peer has closed the
		 * connection due to timeout or reboot. Using sequence
		 * number tp->snd_una-1 causes the transmitted zero-length
		 * segment to lie outside the receive window; by the
		 * protocol spec, this requires the correspondent TCP to
		 * respond.
		 */
		KMOD_TCPSTAT_INC(tcps_keepprobe);
		rack_send_ack_challange(rack);
	}
	rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
	return (1);
dropit:
	KMOD_TCPSTAT_INC(tcps_keepdrops);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	return (-ETIMEDOUT); /* tcp_drop() */
}
7372
7373 /*
7374 * Retransmit helper function, clear up all the ack
7375 * flags and take care of important book keeping.
7376 */
7377 static void
rack_remxt_tmr(struct tcpcb * tp)7378 rack_remxt_tmr(struct tcpcb *tp)
7379 {
7380 /*
7381 * The retransmit timer went off, all sack'd blocks must be
7382 * un-acked.
7383 */
7384 struct rack_sendmap *rsm, *trsm = NULL;
7385 struct tcp_rack *rack;
7386
7387 rack = (struct tcp_rack *)tp->t_fb_ptr;
7388 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
7389 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
7390 rack->r_timer_override = 1;
7391 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
7392 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
7393 rack->r_late = 0;
7394 rack->r_early = 0;
7395 rack->r_ctl.rc_agg_delayed = 0;
7396 rack->r_ctl.rc_agg_early = 0;
7397 if (rack->r_state && (rack->r_state != tp->t_state))
7398 rack_set_state(tp, rack);
7399 if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) {
7400 /*
7401 * We do not clear the scoreboard until we have had
7402 * more than rack_rxt_scoreboard_clear_thresh time-outs.
7403 */
7404 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7405 if (rack->r_ctl.rc_resend != NULL)
7406 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7407
7408 return;
7409 }
7410 /*
7411 * Ideally we would like to be able to
7412 * mark SACK-PASS on anything not acked here.
7413 *
7414 * However, if we do that we would burst out
7415 * all that data 1ms apart. This would be unwise,
7416 * so for now we will just let the normal rxt timer
7417 * and tlp timer take care of it.
7418 *
7419 * Also we really need to stick them back in sequence
7420 * order. This way we send in the proper order and any
7421 * sacks that come floating in will "re-ack" the data.
7422 * To do this we zap the tmap with an INIT and then
7423 * walk through and place every rsm in the tail queue
7424 * hash table back in its seq ordered place.
7425 */
7426 TAILQ_INIT(&rack->r_ctl.rc_tmap);
7427
7428 rack->r_ctl.recovery_rxt_cnt = 0;
7429 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
7430 rsm->r_dupack = 0;
7431 if (rack_verbose_logging)
7432 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7433 /* We must re-add it back to the tlist */
7434 if (trsm == NULL) {
7435 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7436 } else {
7437 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
7438 }
7439 rsm->r_in_tmap = 1;
7440 trsm = rsm;
7441 if (rsm->r_flags & RACK_ACKED)
7442 rsm->r_flags |= RACK_WAS_ACKED;
7443 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST);
7444 rsm->r_flags |= RACK_MUST_RXT;
7445 }
7446 /* zero the lost since it's all gone */
7447 rack->r_ctl.rc_considered_lost = 0;
7448 /* Clear the count (we just un-acked them) */
7449 rack->r_ctl.rc_sacked = 0;
7450 rack->r_ctl.rc_sacklast = NULL;
7451 /* Clear the tlp rtx mark */
7452 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
7453 if (rack->r_ctl.rc_resend != NULL)
7454 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7455 rack->r_ctl.rc_prr_sndcnt = 0;
7456 rack_log_to_prr(rack, 6, 0, __LINE__);
7457 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
7458 if (rack->r_ctl.rc_resend != NULL)
7459 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7460 if (((tp->t_flags & TF_SACK_PERMIT) == 0) &&
7461 ((tp->t_flags & TF_SENTFIN) == 0)) {
7462 /*
7463 * For non-sack customers new data
7464 * needs to go out as retransmits until
7465 * we retransmit up to snd_max.
7466 */
7467 rack->r_must_retran = 1;
7468 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
7469 rack->r_ctl.rc_sacked);
7470 }
7471 }
7472
7473 static void
rack_convert_rtts(struct tcpcb * tp)7474 rack_convert_rtts(struct tcpcb *tp)
7475 {
7476 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
7477 tp->t_rxtcur = RACK_REXMTVAL(tp);
7478 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
7479 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
7480 }
7481 if (tp->t_rxtcur > rack_rto_max) {
7482 tp->t_rxtcur = rack_rto_max;
7483 }
7484 }
7485
7486 static void
rack_cc_conn_init(struct tcpcb * tp)7487 rack_cc_conn_init(struct tcpcb *tp)
7488 {
7489 struct tcp_rack *rack;
7490 uint32_t srtt;
7491
7492 rack = (struct tcp_rack *)tp->t_fb_ptr;
7493 srtt = tp->t_srtt;
7494 cc_conn_init(tp);
7495 /*
7496 * Now convert to rack's internal format,
7497 * if required.
7498 */
7499 if ((srtt == 0) && (tp->t_srtt != 0))
7500 rack_convert_rtts(tp);
7501 /*
7502 * We want a chance to stay in slowstart as
7503 * we create a connection. TCP spec says that
7504 * initially ssthresh is infinite. For our
7505 * purposes that is the snd_wnd.
7506 */
7507 if (tp->snd_ssthresh < tp->snd_wnd) {
7508 tp->snd_ssthresh = tp->snd_wnd;
7509 }
7510 /*
7511 * We also want to assure a IW worth of
7512 * data can get inflight.
7513 */
7514 if (rc_init_window(rack) < tp->snd_cwnd)
7515 tp->snd_cwnd = rc_init_window(rack);
7516 }
7517
7518 /*
7519 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
7520 * we will setup to retransmit the lowest seq number outstanding.
7521 */
7522 static int
rack_timeout_rxt(struct tcpcb * tp,struct tcp_rack * rack,uint32_t cts)7523 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7524 {
7525 struct inpcb *inp = tptoinpcb(tp);
7526 int32_t rexmt;
7527 int32_t retval = 0;
7528 bool isipv6;
7529
7530 if ((tp->t_flags & TF_GPUTINPROG) &&
7531 (tp->t_rxtshift)) {
7532 /*
7533 * We have had a second timeout
7534 * measurements on successive rxt's are not profitable.
7535 * It is unlikely to be of any use (the network is
7536 * broken or the client went away).
7537 */
7538 tp->t_flags &= ~TF_GPUTINPROG;
7539 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7540 rack->r_ctl.rc_gp_srtt /*flex1*/,
7541 tp->gput_seq,
7542 0, 0, 18, __LINE__, NULL, 0);
7543 }
7544 if (ctf_progress_timeout_check(tp, false)) {
7545 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7546 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7547 return (-ETIMEDOUT); /* tcp_drop() */
7548 }
7549 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
7550 rack->r_ctl.retran_during_recovery = 0;
7551 rack->rc_ack_required = 1;
7552 rack->r_ctl.dsack_byte_cnt = 0;
7553 if (IN_RECOVERY(tp->t_flags) &&
7554 (rack->rto_from_rec == 0)) {
7555 /*
7556 * Mark that we had a rto while in recovery
7557 * and save the ssthresh so if we go back
7558 * into recovery we will have a chance
7559 * to slowstart back to the level.
7560 */
7561 rack->rto_from_rec = 1;
7562 rack->r_ctl.rto_ssthresh = tp->snd_ssthresh;
7563 }
7564 if (IN_FASTRECOVERY(tp->t_flags))
7565 tp->t_flags |= TF_WASFRECOVERY;
7566 else
7567 tp->t_flags &= ~TF_WASFRECOVERY;
7568 if (IN_CONGRECOVERY(tp->t_flags))
7569 tp->t_flags |= TF_WASCRECOVERY;
7570 else
7571 tp->t_flags &= ~TF_WASCRECOVERY;
7572 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7573 (tp->snd_una == tp->snd_max)) {
7574 /* Nothing outstanding .. nothing to do */
7575 return (0);
7576 }
7577 if (rack->r_ctl.dsack_persist) {
7578 rack->r_ctl.dsack_persist--;
7579 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
7580 rack->r_ctl.num_dsack = 0;
7581 }
7582 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
7583 }
7584 /*
7585 * Rack can only run one timer at a time, so we cannot
7586 * run a KEEPINIT (gating SYN sending) and a retransmit
7587 * timer for the SYN. So if we are in a front state and
7588 * have a KEEPINIT timer we need to check the first transmit
7589 * against now to see if we have exceeded the KEEPINIT time
7590 * (if one is set).
7591 */
7592 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
7593 (TP_KEEPINIT(tp) != 0)) {
7594 struct rack_sendmap *rsm;
7595
7596 rsm = tqhash_min(rack->r_ctl.tqh);
7597 if (rsm) {
7598 /* Ok we have something outstanding to test keepinit with */
7599 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
7600 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
7601 /* We have exceeded the KEEPINIT time */
7602 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
7603 goto drop_it;
7604 }
7605 }
7606 }
7607 /*
7608 * Retransmission timer went off. Message has not been acked within
7609 * retransmit interval. Back off to a longer retransmit interval
7610 * and retransmit one segment.
7611 */
7612 if ((rack->r_ctl.rc_resend == NULL) ||
7613 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
7614 /*
7615 * If the rwnd collapsed on
7616 * the one we are retransmitting
7617 * it does not count against the
7618 * rxt count.
7619 */
7620 tp->t_rxtshift++;
7621 }
7622 rack_remxt_tmr(tp);
7623 if (tp->t_rxtshift > V_tcp_retries) {
7624 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7625 drop_it:
7626 tp->t_rxtshift = V_tcp_retries;
7627 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
7628 /* XXXGL: previously t_softerror was casted to uint16_t */
7629 MPASS(tp->t_softerror >= 0);
7630 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
7631 goto out; /* tcp_drop() */
7632 }
7633 if (tp->t_state == TCPS_SYN_SENT) {
7634 /*
7635 * If the SYN was retransmitted, indicate CWND to be limited
7636 * to 1 segment in cc_conn_init().
7637 */
7638 tp->snd_cwnd = 1;
7639 } else if (tp->t_rxtshift == 1) {
7640 /*
7641 * first retransmit; record ssthresh and cwnd so they can be
7642 * recovered if this turns out to be a "bad" retransmit. A
7643 * retransmit is considered "bad" if an ACK for this segment
7644 * is received within RTT/2 interval; the assumption here is
7645 * that the ACK was already in flight. See "On Estimating
7646 * End-to-End Network Path Properties" by Allman and Paxson
7647 * for more details.
7648 */
7649 tp->snd_cwnd_prev = tp->snd_cwnd;
7650 tp->snd_ssthresh_prev = tp->snd_ssthresh;
7651 tp->snd_recover_prev = tp->snd_recover;
7652 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
7653 tp->t_flags |= TF_PREVVALID;
7654 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
7655 tp->t_flags &= ~TF_PREVVALID;
7656 KMOD_TCPSTAT_INC(tcps_rexmttimeo);
7657 if ((tp->t_state == TCPS_SYN_SENT) ||
7658 (tp->t_state == TCPS_SYN_RECEIVED))
7659 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
7660 else
7661 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
7662
7663 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
7664 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
7665 /*
7666 * We enter the path for PLMTUD if connection is established or, if
7667 * connection is FIN_WAIT_1 status, reason for the last is that if
7668 * amount of data we send is very small, we could send it in couple
7669 * of packets and process straight to FIN. In that case we won't
7670 * catch ESTABLISHED state.
7671 */
7672 #ifdef INET6
7673 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
7674 #else
7675 isipv6 = false;
7676 #endif
7677 if (((V_tcp_pmtud_blackhole_detect == 1) ||
7678 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
7679 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
7680 ((tp->t_state == TCPS_ESTABLISHED) ||
7681 (tp->t_state == TCPS_FIN_WAIT_1))) {
7682 /*
7683 * Idea here is that at each stage of mtu probe (usually,
7684 * 1448 -> 1188 -> 524) should be given 2 chances to recover
7685 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
7686 * should take care of that.
7687 */
7688 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
7689 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
7690 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
7691 tp->t_rxtshift % 2 == 0)) {
7692 /*
7693 * Enter Path MTU Black-hole Detection mechanism: -
7694 * Disable Path MTU Discovery (IP "DF" bit). -
7695 * Reduce MTU to lower value than what we negotiated
7696 * with peer.
7697 */
7698 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
7699 /* Record that we may have found a black hole. */
7700 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
7701 /* Keep track of previous MSS. */
7702 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
7703 }
7704
7705 /*
7706 * Reduce the MSS to blackhole value or to the
7707 * default in an attempt to retransmit.
7708 */
7709 #ifdef INET6
7710 if (isipv6 &&
7711 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
7712 /* Use the sysctl tuneable blackhole MSS. */
7713 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
7714 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7715 } else if (isipv6) {
7716 /* Use the default MSS. */
7717 tp->t_maxseg = V_tcp_v6mssdflt;
7718 /*
7719 * Disable Path MTU Discovery when we switch
7720 * to minmss.
7721 */
7722 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7723 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7724 }
7725 #endif
7726 #if defined(INET6) && defined(INET)
7727 else
7728 #endif
7729 #ifdef INET
7730 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
7731 /* Use the sysctl tuneable blackhole MSS. */
7732 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
7733 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7734 } else {
7735 /* Use the default MSS. */
7736 tp->t_maxseg = V_tcp_mssdflt;
7737 /*
7738 * Disable Path MTU Discovery when we switch
7739 * to minmss.
7740 */
7741 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7742 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7743 }
7744 #endif
7745 } else {
7746 /*
7747 * If further retransmissions are still unsuccessful
7748 * with a lowered MTU, maybe this isn't a blackhole
7749 * and we restore the previous MSS and blackhole
7750 * detection flags. The limit '6' is determined by
7751 * giving each probe stage (1448, 1188, 524) 2
7752 * chances to recover.
7753 */
7754 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
7755 (tp->t_rxtshift >= 6)) {
7756 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
7757 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
7758 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
7759 if (tp->t_maxseg < V_tcp_mssdflt) {
7760 /*
7761 * The MSS is so small we should not
7762 * process incoming SACK's since we are
7763 * subject to attack in such a case.
7764 */
7765 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
7766 } else {
7767 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
7768 }
7769 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
7770 }
7771 }
7772 }
7773 /*
7774 * Disable RFC1323 and SACK if we haven't got any response to
7775 * our third SYN to work-around some broken terminal servers
7776 * (most of which have hopefully been retired) that have bad VJ
7777 * header compression code which trashes TCP segments containing
7778 * unknown-to-them TCP options.
7779 */
7780 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
7781 (tp->t_rxtshift == 3))
7782 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
7783 /*
7784 * If we backed off this far, our srtt estimate is probably bogus.
7785 * Clobber it so we'll take the next rtt measurement as our srtt;
7786 * move the current srtt into rttvar to keep the current retransmit
7787 * times until then.
7788 */
7789 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
7790 #ifdef INET6
7791 if ((inp->inp_vflag & INP_IPV6) != 0)
7792 in6_losing(inp);
7793 else
7794 #endif
7795 in_losing(inp);
7796 tp->t_rttvar += tp->t_srtt;
7797 tp->t_srtt = 0;
7798 }
7799 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
7800 tp->snd_recover = tp->snd_max;
7801 tp->t_flags |= TF_ACKNOW;
7802 tp->t_rtttime = 0;
7803 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__);
7804 out:
7805 return (retval);
7806 }
7807
static int
rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
{
	/*
	 * Determine which (if any) armed rack timer has expired at time cts
	 * and run its handler.  Exactly one of the PACE_TMR_* handlers is
	 * invoked per call; the outcome is recorded via
	 * rack_log_to_processing().  doing_tlp is an output flag filled in
	 * by rack_timeout_tlp() when a tail-loss-probe is initiated.
	 * hpts_calling is non-zero when we were invoked from the hpts wheel
	 * (as opposed to a user send or queued-mbuf path).
	 */
	int32_t ret = 0;
	int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);

	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
	    (tp->t_flags & TF_GPUTINPROG)) {
		/*
		 * We have a goodput in progress
		 * and we have entered a late state.
		 * Do we have enough data in the sb
		 * to handle the GPUT request?
		 */
		uint32_t bytes;

		bytes = tp->gput_ack - tp->gput_seq;
		if (SEQ_GT(tp->gput_seq, tp->snd_una))
			bytes += tp->gput_seq - tp->snd_una;
		if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
			/*
			 * There are not enough bytes in the socket
			 * buffer that have been sent to cover this
			 * measurement. Cancel it.
			 */
			rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
						   rack->r_ctl.rc_gp_srtt /*flex1*/,
						   tp->gput_seq,
						   0, 0, 18, __LINE__, NULL, 0);
			tp->t_flags &= ~TF_GPUTINPROG;
		}
	}
	if (timers == 0) {
		/* No timer flags armed; nothing for us to process. */
		return (0);
	}
	if (tp->t_state == TCPS_LISTEN) {
		/* no timers on listen sockets */
		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
			return (0);
		return (1);
	}
	if ((timers & PACE_TMR_RACK) &&
	    rack->rc_on_min_to) {
		/*
		 * For the rack timer when we
		 * are on a min-timeout (which means rrr_conf = 3)
		 * we don't want to check the timer. It may
		 * be going off for a pace and thats ok we
		 * want to send the retransmit (if its ready).
		 *
		 * If its on a normal rack timer (non-min) then
		 * we will check if its expired.
		 */
		goto skip_time_check;
	}
	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
		/* The timer has not actually expired yet: a false alarm. */
		uint32_t left;

		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
			/* Output pacing is in charge; let it run. */
			ret = -1;
			rack_log_to_processing(rack, cts, ret, 0);
			return (0);
		}
		if (hpts_calling == 0) {
			/*
			 * A user send or queued mbuf (sack) has called us? We
			 * return 0 and let the pacing guards
			 * deal with it if they should or
			 * should not cause a send.
			 */
			ret = -2;
			rack_log_to_processing(rack, cts, ret, 0);
			return (0);
		}
		/*
		 * Ok our timer went off early and we are not paced false
		 * alarm, go back to sleep. We make sure we don't have
		 * no-sack wakeup on since we no longer have a PKT_OUTPUT
		 * flag in place.
		 */
		rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
		ret = -3;
		left = rack->r_ctl.rc_timer_exp - cts;
		/* Re-arm the hpts wheel for the time remaining. */
		tcp_hpts_insert(tp, left, NULL);
		rack_log_to_processing(rack, cts, ret, left);
		return (1);
	}
skip_time_check:
	/*
	 * A timer really did expire: clear the armed-timer state before
	 * dispatching, then run the highest-priority expired timer.
	 */
	rack->rc_tmr_stopped = 0;
	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
	if (timers & PACE_TMR_DELACK) {
		ret = rack_timeout_delack(tp, rack, cts);
	} else if (timers & PACE_TMR_RACK) {
		rack->r_ctl.rc_tlp_rxt_last_time = cts;
		rack->r_fast_output = 0;
		ret = rack_timeout_rack(tp, rack, cts);
	} else if (timers & PACE_TMR_TLP) {
		rack->r_ctl.rc_tlp_rxt_last_time = cts;
		rack->r_fast_output = 0;
		ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
	} else if (timers & PACE_TMR_RXT) {
		rack->r_ctl.rc_tlp_rxt_last_time = cts;
		rack->r_fast_output = 0;
		ret = rack_timeout_rxt(tp, rack, cts);
	} else if (timers & PACE_TMR_PERSIT) {
		ret = rack_timeout_persist(tp, rack, cts);
	} else if (timers & PACE_TMR_KEEP) {
		ret = rack_timeout_keepalive(tp, rack, cts);
	}
	rack_log_to_processing(rack, cts, ret, timers);
	return (ret);
}
7920
static void
rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
{
	/*
	 * Cancel any pending rack timers and, when appropriate, pull the
	 * connection off the hpts wheel.  'line' identifies the caller for
	 * the cancel log (rack_log_to_cancel()).
	 */
	struct timeval tv;
	uint32_t us_cts, flags_on_entry;
	uint8_t hpts_removed = 0;

	/* Snapshot flags before we start clearing them, for logging. */
	flags_on_entry = rack->r_ctl.rc_hpts_flags;
	us_cts = tcp_get_usecs(&tv);
	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
	    ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
	     ((tp->snd_max - tp->snd_una) == 0))) {
		/*
		 * A paced output is scheduled but either its deadline has
		 * already passed or nothing is outstanding; drop off hpts.
		 */
		tcp_hpts_remove(rack->rc_tp);
		hpts_removed = 1;
		/* If we were not delayed cancel out the flag. */
		if ((tp->snd_max - tp->snd_una) == 0)
			rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
	}
	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
		/* Remember which timer(s) we stopped. */
		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
		if (tcp_in_hpts(rack->rc_tp) &&
		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
			/*
			 * Canceling timer's when we have no output being
			 * paced. We also must remove ourselves from the
			 * hpts.
			 */
			tcp_hpts_remove(rack->rc_tp);
			hpts_removed = 1;
		}
		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
	}
	/* Make sure a cancel record is always logged exactly once. */
	if (hpts_removed == 0)
		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
}
7957
7958 static int
rack_stopall(struct tcpcb * tp)7959 rack_stopall(struct tcpcb *tp)
7960 {
7961 struct tcp_rack *rack;
7962
7963 rack = (struct tcp_rack *)tp->t_fb_ptr;
7964 rack->t_timers_stopped = 1;
7965
7966 tcp_hpts_remove(tp);
7967
7968 return (0);
7969 }
7970
7971 static void
rack_stop_all_timers(struct tcpcb * tp,struct tcp_rack * rack)7972 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
7973 {
7974 /*
7975 * Assure no timers are running.
7976 */
7977 if (tcp_timer_active(tp, TT_PERSIST)) {
7978 /* We enter in persists, set the flag appropriately */
7979 rack->rc_in_persist = 1;
7980 }
7981 if (tcp_in_hpts(rack->rc_tp)) {
7982 tcp_hpts_remove(rack->rc_tp);
7983 }
7984 }
7985
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
{
	/*
	 * Account for a (re)transmission of the whole sendmap entry rsm at
	 * time ts: bump the retransmit counters, record the send time in
	 * r_tim_lastsent[], move the entry to the tail of the time-ordered
	 * tmap, and clear any state that no longer applies (ACKED,
	 * WAS_LOST, MUST_RXT, RWND_COLLAPSED, SACK_PASSED).
	 *
	 * NOTE(review): add_flag is not referenced in this body — confirm
	 * whether it is intentionally unused here.
	 */
	int32_t idx;

	rsm->r_rtr_cnt++;
	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
		/* Cap the count; remember that we overflowed it. */
		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
		rsm->r_flags |= RACK_OVERMAX;
	}
	rsm->r_act_rxt_cnt++;
	/* Peg the count/index */
	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
	rsm->r_dupack = 0;
	if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
		/* A true (non-TLP) retransmission: count the rxt'd bytes. */
		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
	}
	if (rsm->r_flags & RACK_WAS_LOST) {
		/*
		 * We retransmitted it putting it back in flight
		 * remove the lost designation and reduce the
		 * bytes considered lost.
		 */
		rack_mark_nolonger_lost(rack, rsm);
	}
	/* Record the send timestamp in the slot for this retransmission. */
	idx = rsm->r_rtr_cnt - 1;
	rsm->r_tim_lastsent[idx] = ts;
	/*
	 * Here we don't add in the len of send, since its already
	 * in snduna <->snd_max.
	 */
	rsm->r_fas = ctf_flight_size(rack->rc_tp,
				     rack->r_ctl.rc_sacked);
	if (rsm->r_flags & RACK_ACKED) {
		/* Probably MTU discovery messing with us */
		rsm->r_flags &= ~RACK_ACKED;
		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
	}
	/* Re-queue at the tail of the transmit-time-ordered map. */
	if (rsm->r_in_tmap) {
		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
		rsm->r_in_tmap = 0;
	}
	/* Lets make sure it really is in or not the GP window */
	rack_mark_in_gp_win(tp, rsm);
	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
	rsm->r_in_tmap = 1;
	/* Bytes-above-segsiz, rounded up to whole segments. */
	rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz);
	/* Take off the must retransmit flag, if its on */
	if (rsm->r_flags & RACK_MUST_RXT) {
		if (rack->r_must_retran)
			rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
		if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
			/*
			 * We have retransmitted all we need. Clear
			 * any must retransmit flags.
			 */
			rack->r_must_retran = 0;
			rack->r_ctl.rc_out_at_rto = 0;
		}
		rsm->r_flags &= ~RACK_MUST_RXT;
	}
	/* Remove any collapsed flag */
	rsm->r_flags &= ~RACK_RWND_COLLAPSED;
	if (rsm->r_flags & RACK_SACK_PASSED) {
		/* We have retransmitted due to the SACK pass */
		rsm->r_flags &= ~RACK_SACK_PASSED;
		rsm->r_flags |= RACK_WAS_SACKPASS;
		rack->r_ctl.recovery_rxt_cnt += (rsm->r_end - rsm->r_start);
	}
}
8058
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz)
{
	/*
	 * We (re-)transmitted starting at rsm->r_start for some length
	 * (possibly less than r_end.
	 *
	 * On return, *lenp holds the number of bytes of the send that were
	 * NOT covered by this entry (0 when fully consumed).  The return
	 * value is the sequence number where the caller should continue
	 * (rsm->r_end when the send spills into the next entry, else 0).
	 */
	struct rack_sendmap *nrsm;
	int insret __diagused;
	uint32_t c_end;
	int32_t len;

	len = *lenp;
	c_end = rsm->r_start + len;
	if (SEQ_GEQ(c_end, rsm->r_end)) {
		/*
		 * We retransmitted the whole piece or more than the whole
		 * slopping into the next rsm.
		 */
		rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
		if (c_end == rsm->r_end) {
			*lenp = 0;
			return (0);
		} else {
			int32_t act_len;

			/* Hangs over the end return whats left */
			act_len = rsm->r_end - rsm->r_start;
			*lenp = (len - act_len);
			return (rsm->r_end);
		}
		/* We don't get out of this block. */
	}
	/*
	 * Here we retransmitted less than the whole thing which means we
	 * have to split this into what was transmitted and what was not.
	 */
	nrsm = rack_alloc_full_limit(rack);
	if (nrsm == NULL) {
		/*
		 * We can't get memory, so lets not proceed.
		 */
		*lenp = 0;
		return (0);
	}
	/*
	 * So here we are going to take the original rsm and make it what we
	 * retransmitted. nrsm will be the tail portion we did not
	 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
	 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
	 * 1, 6 and the new piece will be 6, 11.
	 */
	rack_clone_rsm(rack, nrsm, rsm, c_end);
	nrsm->r_dupack = 0;
	rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
#ifndef INVARIANTS
	(void)tqhash_insert(rack->r_ctl.tqh, nrsm);
#else
	if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
		panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
		      nrsm, insret, rack, rsm);
	}
#endif
	/* Keep the split-off tail adjacent to rsm in the time-ordered map. */
	if (rsm->r_in_tmap) {
		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
		nrsm->r_in_tmap = 1;
	}
	/* Only the tail (nrsm) can carry the FIN now. */
	rsm->r_flags &= (~RACK_HAS_FIN);
	rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
	/* Log a split of rsm into rsm and nrsm */
	rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
	*lenp = 0;
	return (0);
}
8134
8135 static void
rack_log_output(struct tcpcb * tp,struct tcpopt * to,int32_t len,uint32_t seq_out,uint16_t th_flags,int32_t err,uint64_t cts,struct rack_sendmap * hintrsm,uint32_t add_flag,struct mbuf * s_mb,uint32_t s_moff,int hw_tls,int segsiz)8136 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
8137 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
8138 struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb,
8139 uint32_t s_moff, int hw_tls, int segsiz)
8140 {
8141 struct tcp_rack *rack;
8142 struct rack_sendmap *rsm, *nrsm;
8143 int insret __diagused;
8144
8145 register uint32_t snd_max, snd_una;
8146
8147 /*
8148 * Add to the RACK log of packets in flight or retransmitted. If
8149 * there is a TS option we will use the TS echoed, if not we will
8150 * grab a TS.
8151 *
8152 * Retransmissions will increment the count and move the ts to its
8153 * proper place. Note that if options do not include TS's then we
8154 * won't be able to effectively use the ACK for an RTT on a retran.
8155 *
8156 * Notes about r_start and r_end. Lets consider a send starting at
8157 * sequence 1 for 10 bytes. In such an example the r_start would be
8158 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
8159 * This means that r_end is actually the first sequence for the next
8160 * slot (11).
8161 *
8162 */
8163 /*
8164 * If err is set what do we do XXXrrs? should we not add the thing?
8165 * -- i.e. return if err != 0 or should we pretend we sent it? --
8166 * i.e. proceed with add ** do this for now.
8167 */
8168 INP_WLOCK_ASSERT(tptoinpcb(tp));
8169 if (err)
8170 /*
8171 * We don't log errors -- we could but snd_max does not
8172 * advance in this case either.
8173 */
8174 return;
8175
8176 if (th_flags & TH_RST) {
8177 /*
8178 * We don't log resets and we return immediately from
8179 * sending
8180 */
8181 return;
8182 }
8183 rack = (struct tcp_rack *)tp->t_fb_ptr;
8184 snd_una = tp->snd_una;
8185 snd_max = tp->snd_max;
8186 if (th_flags & (TH_SYN | TH_FIN)) {
8187 /*
8188 * The call to rack_log_output is made before bumping
8189 * snd_max. This means we can record one extra byte on a SYN
8190 * or FIN if seq_out is adding more on and a FIN is present
8191 * (and we are not resending).
8192 */
8193 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
8194 len++;
8195 if (th_flags & TH_FIN)
8196 len++;
8197 }
8198 if (SEQ_LEQ((seq_out + len), snd_una)) {
8199 /* Are sending an old segment to induce an ack (keep-alive)? */
8200 return;
8201 }
8202 if (SEQ_LT(seq_out, snd_una)) {
8203 /* huh? should we panic? */
8204 uint32_t end;
8205
8206 end = seq_out + len;
8207 seq_out = snd_una;
8208 if (SEQ_GEQ(end, seq_out))
8209 len = end - seq_out;
8210 else
8211 len = 0;
8212 }
8213 if (len == 0) {
8214 /* We don't log zero window probes */
8215 return;
8216 }
8217 if (IN_FASTRECOVERY(tp->t_flags)) {
8218 rack->r_ctl.rc_prr_out += len;
8219 }
8220 /* First question is it a retransmission or new? */
8221 if (seq_out == snd_max) {
8222 /* Its new */
8223 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts);
8224 again:
8225 rsm = rack_alloc(rack);
8226 if (rsm == NULL) {
8227 /*
8228 * Hmm out of memory and the tcb got destroyed while
8229 * we tried to wait.
8230 */
8231 return;
8232 }
8233 if (th_flags & TH_FIN) {
8234 rsm->r_flags = RACK_HAS_FIN|add_flag;
8235 } else {
8236 rsm->r_flags = add_flag;
8237 }
8238 if (hw_tls)
8239 rsm->r_hw_tls = 1;
8240 rsm->r_tim_lastsent[0] = cts;
8241 rsm->r_rtr_cnt = 1;
8242 rsm->r_act_rxt_cnt = 0;
8243 rsm->r_rtr_bytes = 0;
8244 if (th_flags & TH_SYN) {
8245 /* The data space is one beyond snd_una */
8246 rsm->r_flags |= RACK_HAS_SYN;
8247 }
8248 rsm->r_start = seq_out;
8249 rsm->r_end = rsm->r_start + len;
8250 rack_mark_in_gp_win(tp, rsm);
8251 rsm->r_dupack = 0;
8252 /*
8253 * save off the mbuf location that
8254 * sndmbuf_noadv returned (which is
8255 * where we started copying from)..
8256 */
8257 rsm->m = s_mb;
8258 rsm->soff = s_moff;
8259 /*
8260 * Here we do add in the len of send, since its not yet
8261 * reflected in in snduna <->snd_max
8262 */
8263 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
8264 rack->r_ctl.rc_sacked) +
8265 (rsm->r_end - rsm->r_start));
8266 if ((rack->rc_initial_ss_comp == 0) &&
8267 (rack->r_ctl.ss_hi_fs < rsm->r_fas)) {
8268 rack->r_ctl.ss_hi_fs = rsm->r_fas;
8269 }
8270 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
8271 if (rsm->m) {
8272 if (rsm->m->m_len <= rsm->soff) {
8273 /*
8274 * XXXrrs Question, will this happen?
8275 *
8276 * If sbsndptr is set at the correct place
8277 * then s_moff should always be somewhere
8278 * within rsm->m. But if the sbsndptr was
8279 * off then that won't be true. If it occurs
8280 * we need to walkout to the correct location.
8281 */
8282 struct mbuf *lm;
8283
8284 lm = rsm->m;
8285 while (lm->m_len <= rsm->soff) {
8286 rsm->soff -= lm->m_len;
8287 lm = lm->m_next;
8288 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
8289 __func__, rack, s_moff, s_mb, rsm->soff));
8290 }
8291 rsm->m = lm;
8292 }
8293 rsm->orig_m_len = rsm->m->m_len;
8294 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
8295 } else {
8296 rsm->orig_m_len = 0;
8297 rsm->orig_t_space = 0;
8298 }
8299 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz);
8300 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8301 /* Log a new rsm */
8302 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
8303 #ifndef INVARIANTS
8304 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
8305 #else
8306 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
8307 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
8308 nrsm, insret, rack, rsm);
8309 }
8310 #endif
8311 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8312 rsm->r_in_tmap = 1;
8313 if (rsm->r_flags & RACK_IS_PCM) {
8314 rack->r_ctl.pcm_i.send_time = cts;
8315 rack->r_ctl.pcm_i.eseq = rsm->r_end;
8316 /* First time through we set the start too */
8317 if (rack->pcm_in_progress == 0)
8318 rack->r_ctl.pcm_i.sseq = rsm->r_start;
8319 }
8320 /*
8321 * Special case detection, is there just a single
8322 * packet outstanding when we are not in recovery?
8323 *
8324 * If this is true mark it so.
8325 */
8326 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
8327 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
8328 struct rack_sendmap *prsm;
8329
8330 prsm = tqhash_prev(rack->r_ctl.tqh, rsm);
8331 if (prsm)
8332 prsm->r_one_out_nr = 1;
8333 }
8334 return;
8335 }
8336 /*
8337 * If we reach here its a retransmission and we need to find it.
8338 */
8339 more:
8340 if (hintrsm && (hintrsm->r_start == seq_out)) {
8341 rsm = hintrsm;
8342 hintrsm = NULL;
8343 } else {
8344 /* No hints sorry */
8345 rsm = NULL;
8346 }
8347 if ((rsm) && (rsm->r_start == seq_out)) {
8348 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8349 if (len == 0) {
8350 return;
8351 } else {
8352 goto more;
8353 }
8354 }
8355 /* Ok it was not the last pointer go through it the hard way. */
8356 refind:
8357 rsm = tqhash_find(rack->r_ctl.tqh, seq_out);
8358 if (rsm) {
8359 if (rsm->r_start == seq_out) {
8360 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8361 if (len == 0) {
8362 return;
8363 } else {
8364 goto refind;
8365 }
8366 }
8367 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
8368 /* Transmitted within this piece */
8369 /*
8370 * Ok we must split off the front and then let the
8371 * update do the rest
8372 */
8373 nrsm = rack_alloc_full_limit(rack);
8374 if (nrsm == NULL) {
8375 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz);
8376 return;
8377 }
8378 /*
8379 * copy rsm to nrsm and then trim the front of rsm
8380 * to not include this part.
8381 */
8382 rack_clone_rsm(rack, nrsm, rsm, seq_out);
8383 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
8384 #ifndef INVARIANTS
8385 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
8386 #else
8387 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
8388 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
8389 nrsm, insret, rack, rsm);
8390 }
8391 #endif
8392 if (rsm->r_in_tmap) {
8393 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8394 nrsm->r_in_tmap = 1;
8395 }
8396 rsm->r_flags &= (~RACK_HAS_FIN);
8397 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz);
8398 if (len == 0) {
8399 return;
8400 } else if (len > 0)
8401 goto refind;
8402 }
8403 }
8404 /*
8405 * Hmm not found in map did they retransmit both old and on into the
8406 * new?
8407 */
8408 if (seq_out == tp->snd_max) {
8409 goto again;
8410 } else if (SEQ_LT(seq_out, tp->snd_max)) {
8411 #ifdef INVARIANTS
8412 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
8413 seq_out, len, tp->snd_una, tp->snd_max);
8414 printf("Starting Dump of all rack entries\n");
8415 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
8416 printf("rsm:%p start:%u end:%u\n",
8417 rsm, rsm->r_start, rsm->r_end);
8418 }
8419 printf("Dump complete\n");
8420 panic("seq_out not found rack:%p tp:%p",
8421 rack, tp);
8422 #endif
8423 } else {
8424 #ifdef INVARIANTS
8425 /*
8426 * Hmm beyond sndmax? (only if we are using the new rtt-pack
8427 * flag)
8428 */
8429 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
8430 seq_out, len, tp->snd_max, tp);
8431 #endif
8432 }
8433 }
8434
8435 /*
8436 * Record one of the RTT updates from an ack into
8437 * our sample structure.
8438 */
8439
static void
tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
    int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
{
	/*
	 * Record one RTT sample from an ack into rack's per-ack sample
	 * structure (r_ctl.rack_rs): track the lowest/highest rtt seen in
	 * this ack, the running total and count, and keep the best us_rtt
	 * according to the confidence rules below.
	 */
	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
	    (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
		rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
	}
	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
	    (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
		rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
	}
	if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
		/* Feed the goodput measurement's low-rtt/high-rwnd trackers. */
		if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
			rack->r_ctl.rc_gp_lowrtt = us_rtt;
		if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
			rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
	}
	if ((confidence == 1) &&
	    ((rsm == NULL) ||
	     (rsm->r_just_ret) ||
	     (rsm->r_one_out_nr &&
	      len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
		/*
		 * If the rsm had a just return
		 * hit it then we can't trust the
		 * rtt measurement for buffer determination
		 * Note that a confidence of 2, indicates
		 * SACK'd which overrides the r_just_ret or
		 * the r_one_out_nr. If it was a CUM-ACK and
		 * we had only two outstanding, but get an
		 * ack for only 1. Then that also lowers our
		 * confidence.
		 */
		confidence = 0;
	}
	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
	    (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
		if (rack->r_ctl.rack_rs.confidence == 0) {
			/*
			 * We take anything with no current confidence
			 * saved.
			 */
			rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
			rack->r_ctl.rack_rs.confidence = confidence;
			rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
		} else if (confidence != 0) {
			/*
			 * Once we have a confident number,
			 * we can update it with a smaller
			 * value since this confident number
			 * may include the DSACK time until
			 * the next segment (the second one) arrived.
			 */
			rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
			rack->r_ctl.rack_rs.confidence = confidence;
			rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
		}
	}
	rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
	/* Mark the sample set valid and fold this rtt into the ack average. */
	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
	rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
	rack->r_ctl.rack_rs.rs_rtt_cnt++;
}
8504
8505 /*
8506 * Collect new round-trip time estimate
8507 * and update averages and current timeout.
8508 */
static void
tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
{
	/*
	 * Commit the RTT sample(s) collected for this ack: pick the rtt
	 * per the configured sample method, update the goodput srtt and
	 * high/low us-rtt trackers, fold the sample into tp->t_srtt /
	 * t_rttvar (1/8 EWMA in microseconds), and re-derive the
	 * retransmit timeout.
	 */
	int32_t delta;
	int32_t rtt;

	if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
		/* No valid sample */
		return;
	if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
		/* We are to use the lowest RTT seen in a single ack */
		rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
		/* We are to use the highest RTT seen in a single ack */
		rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
		/* We are to use the average RTT seen in a single ack */
		rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
				(uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
	} else {
#ifdef INVARIANTS
		panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
#endif
		return;
	}
	if (rtt == 0)
		rtt = 1;
	if (rack->rc_gp_rtt_set == 0) {
		/*
		 * With no RTT we have to accept
		 * even one we are not confident of.
		 */
		rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
		rack->rc_gp_rtt_set = 1;
	} else if (rack->r_ctl.rack_rs.confidence) {
		/* update the running gp srtt */
		rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
		rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
	}
	if (rack->r_ctl.rack_rs.confidence) {
		/*
		 * record the low and high for highly buffered path computation,
		 * we only do this if we are confident (not a retransmission).
		 */
		if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
			rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
		}
		if (rack->rc_highly_buffered == 0) {
			/*
			 * Currently once we declare a path has
			 * highly buffered there is no going
			 * back, which may be a problem...
			 */
			if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
				rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
						     rack->r_ctl.rc_highest_us_rtt,
						     rack->r_ctl.rc_lowest_us_rtt,
						     RACK_RTTS_SEEHBP);
				rack->rc_highly_buffered = 1;
			}
		}
	}
	if ((rack->r_ctl.rack_rs.confidence) ||
	    (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
		/*
		 * If we are highly confident of it <or> it was
		 * never retransmitted we accept it as the last us_rtt.
		 */
		rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
		/* The lowest rtt can be set if its was not retransmitted */
		if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
			rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
			if (rack->r_ctl.rc_lowest_us_rtt == 0)
				rack->r_ctl.rc_lowest_us_rtt = 1;
		}
	}
	/* NOTE(review): redundant — rack was passed in and should already equal tp->t_fb_ptr. */
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (tp->t_srtt != 0) {
		/*
		 * We keep a simple srtt in microseconds, like our rtt
		 * measurement. We don't need to do any tricks with shifting
		 * etc. Instead we just add in 1/8th of the new measurement
		 * and subtract out 1/8 of the old srtt. We do the same with
		 * the variance after finding the absolute value of the
		 * difference between this sample and the current srtt.
		 */
		delta = tp->t_srtt - rtt;
		/* Take off 1/8th of the current sRTT */
		tp->t_srtt -= (tp->t_srtt >> 3);
		/* Add in 1/8th of the new RTT just measured */
		tp->t_srtt += (rtt >> 3);
		if (tp->t_srtt <= 0)
			tp->t_srtt = 1;
		/* Now lets make the absolute value of the variance */
		if (delta < 0)
			delta = -delta;
		/* Subtract out 1/8th */
		tp->t_rttvar -= (tp->t_rttvar >> 3);
		/* Add in 1/8th of the new variance we just saw */
		tp->t_rttvar += (delta >> 3);
		if (tp->t_rttvar <= 0)
			tp->t_rttvar = 1;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt. Set the
		 * variance to half the rtt (so our first retransmit happens
		 * at 3*rtt).
		 */
		tp->t_srtt = rtt;
		tp->t_rttvar = rtt >> 1;
	}
	rack->rc_srtt_measure_made = 1;
	KMOD_TCPSTAT_INC(tcps_rttupdated);
	if (tp->t_rttupdated < UCHAR_MAX)
		tp->t_rttupdated++;
#ifdef STATS
	if (rack_stats_gets_ms_rtt == 0) {
		/* Send in the microsecond rtt used for rxt timeout purposes */
		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
	} else if (rack_stats_gets_ms_rtt == 1) {
		/* Send in the millisecond rtt used for rxt timeout purposes */
		int32_t ms_rtt;

		/* Round up */
		ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
	} else if (rack_stats_gets_ms_rtt == 2) {
		/* Send in the millisecond rtt has close to the path RTT as we can get */
		int32_t ms_rtt;

		/* Round up */
		ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
	} else {
		/* Send in the microsecond rtt has close to the path RTT as we can get */
		stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
	}
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
#endif
	rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
	/*
	 * the retransmit should happen at rtt + 4 * rttvar. Because of the
	 * way we do the smoothing, srtt and rttvar will each average +1/2
	 * tick of bias. When we compute the retransmit timer, we want 1/2
	 * tick of rounding and 1 extra tick because of +-1/2 tick
	 * uncertainty in the firing of the timer. The bias will give us
	 * exactly the 1.5 tick we need. But, because the bias is
	 * statistical, we have to test that we don't drop below the minimum
	 * feasible timer (which is 2 ticks).
	 */
	tp->t_rxtshift = 0;
	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
			   max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
	rack_log_rtt_sample(rack, rtt);
	tp->t_softerror = 0;
}
8665
8666
8667 static void
rack_apply_updated_usrtt(struct tcp_rack * rack,uint32_t us_rtt,uint32_t us_cts)8668 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
8669 {
8670 /*
8671 * Apply to filter the inbound us-rtt at us_cts.
8672 */
8673 uint32_t old_rtt;
8674
8675 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
8676 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
8677 us_rtt, us_cts);
8678 if (old_rtt > us_rtt) {
8679 /* We just hit a new lower rtt time */
8680 rack_log_rtt_shrinks(rack, us_cts, old_rtt,
8681 __LINE__, RACK_RTTS_NEWRTT);
8682 /*
8683 * Only count it if its lower than what we saw within our
8684 * calculated range.
8685 */
8686 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
8687 if (rack_probertt_lower_within &&
8688 rack->rc_gp_dyn_mul &&
8689 (rack->use_fixed_rate == 0) &&
8690 (rack->rc_always_pace)) {
8691 /*
8692 * We are seeing a new lower rtt very close
8693 * to the time that we would have entered probe-rtt.
8694 * This is probably due to the fact that a peer flow
8695 * has entered probe-rtt. Lets go in now too.
8696 */
8697 uint32_t val;
8698
8699 val = rack_probertt_lower_within * rack_time_between_probertt;
8700 val /= 100;
8701 if ((rack->in_probe_rtt == 0) &&
8702 (rack->rc_skip_timely == 0) &&
8703 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
8704 rack_enter_probertt(rack, us_cts);
8705 }
8706 }
8707 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
8708 }
8709 }
8710 }
8711
/*
 * Take an RTT measurement from the ack of the given rsm and feed it to
 * the various consumers: tp->t_rttlow, the rack minimum rtt, the CC
 * module's rttsample() hook, the goodput min-rtt filter and the
 * smoothed-rtt/timer machinery (via tcp_rack_xmit_timer()).
 *
 * ack_type is CUM_ACKED or SACKED; th_ack is only meaningful for
 * cum-acks. Returns 1 if a valid sample was taken and the rack timing
 * state updated, 0 if no sample could be taken (segment already acked,
 * samples prohibited on it, or an ambiguous/spurious retransmission).
 * Time values (cts, r_tim_lastsent[]) are in the same timebase;
 * presumably microseconds here -- the us_rtt paths make that explicit.
 */
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
{
	uint32_t us_rtt;
	int32_t i, all;
	uint32_t t, len_acked;

	if ((rsm->r_flags & RACK_ACKED) ||
	    (rsm->r_flags & RACK_WAS_ACKED))
		/* Already done */
		return (0);
	if (rsm->r_no_rtt_allowed) {
		/* Not allowed */
		return (0);
	}
	/* Work out how much of this rsm the ack accounts for */
	if (ack_type == CUM_ACKED) {
		if (SEQ_GT(th_ack, rsm->r_end)) {
			/* Cum-ack moved beyond the whole block */
			len_acked = rsm->r_end - rsm->r_start;
			all = 1;
		} else {
			/* Cum-ack covers only part of the block */
			len_acked = th_ack - rsm->r_start;
			all = 0;
		}
	} else {
		/* A SACK always covers the entire block */
		len_acked = rsm->r_end - rsm->r_start;
		all = 0;
	}
	if (rsm->r_rtr_cnt == 1) {
		/*
		 * Sent only once -- the sample is unambiguous, measure
		 * from the single transmit time.
		 */
		t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
		if ((int)t <= 0)
			t = 1;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		if (!rack->r_ctl.rc_rack_min_rtt ||
		    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
			rack->r_ctl.rc_rack_min_rtt = t;
			if (rack->r_ctl.rc_rack_min_rtt == 0) {
				/* 0 means "unset", so clamp a real min to 1 */
				rack->r_ctl.rc_rack_min_rtt = 1;
			}
		}
		/* Prefer the packet's actual arrival time for the us rtt */
		if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
			us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
		else
			us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
		if (us_rtt == 0)
			us_rtt = 1;
		if (CC_ALGO(tp)->rttsample != NULL) {
			/* Kick the RTT to the CC */
			CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
		}
		rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
		if (ack_type == SACKED) {
			rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
			tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
		} else {
			/*
			 * We need to setup what our confidence
			 * is in this ack.
			 *
			 * If the rsm was app limited and it is
			 * less than a mss in length (the end
			 * of the send) then we have a gap. If we
			 * were app limited but say we were sending
			 * multiple MSS's then we are more confident
			 * int it.
			 *
			 * When we are not app-limited then we see if
			 * the rsm is being included in the current
			 * measurement, we tell this by the app_limited_needs_set
			 * flag.
			 *
			 * Note that being cwnd blocked is not applimited
			 * as well as the pacing delay between packets which
			 * are sending only 1 or 2 MSS's also will show up
			 * in the RTT. We probably need to examine this algorithm
			 * a bit more and enhance it to account for the delay
			 * between rsm's. We could do that by saving off the
			 * pacing delay of each rsm (in an rsm) and then
			 * factoring that in somehow though for now I am
			 * not sure how :)
			 */
			int calc_conf = 0;

			if (rsm->r_flags & RACK_APP_LIMITED) {
				/* Fully acked but only a partial-mss send: low confidence */
				if (all && (len_acked <= ctf_fixed_maxseg(tp)))
					calc_conf = 0;
				else
					calc_conf = 1;
			} else if (rack->app_limited_needs_set == 0) {
				calc_conf = 1;
			} else {
				calc_conf = 0;
			}
			rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
			tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
					    calc_conf, rsm, rsm->r_rtr_cnt);
		}
		if ((rsm->r_flags & RACK_TLP) &&
		    (!IN_FASTRECOVERY(tp->t_flags))) {
			/* Segment was a TLP and our retrans matched */
			if (rack->r_ctl.rc_tlp_cwnd_reduce) {
				rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
			}
		}
		if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
		    (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
			    (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
			/* New more recent rack_tmit_time */
			rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
			if (rack->r_ctl.rc_rack_tmit_time == 0)
				rack->r_ctl.rc_rack_tmit_time = 1;
			rack->rc_rack_rtt = t;
		}
		return (1);
	}
	/*
	 * From here on the rsm was retransmitted at least once.
	 *
	 * We clear the soft/rxtshift since we got an ack.
	 * There is no assurance we will call the commit() function
	 * so we need to clear these to avoid incorrect handling.
	 */
	tp->t_rxtshift = 0;
	RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
	    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
	tp->t_softerror = 0;
	if (to && (to->to_flags & TOF_TS) &&
	    (ack_type == CUM_ACKED) &&
	    (to->to_tsecr) &&
	    ((rsm->r_flags & RACK_OVERMAX) == 0)) {
		/*
		 * Now which timestamp does it match? In this block the ACK
		 * must be coming from a previous transmission.
		 */
		for (i = 0; i < rsm->r_rtr_cnt; i++) {
			if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
				t = cts - (uint32_t)rsm->r_tim_lastsent[i];
				if ((int)t <= 0)
					t = 1;
				if (CC_ALGO(tp)->rttsample != NULL) {
					/*
					 * Kick the RTT to the CC, here
					 * we lie a bit in that we know the
					 * retransmission is correct even though
					 * we retransmitted. This is because
					 * we match the timestamps.
					 */
					if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
						us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
					else
						us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
					CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
				}
				if ((i + 1) < rsm->r_rtr_cnt) {
					/*
					 * The peer ack'd from our previous
					 * transmission. We have a spurious
					 * retransmission and thus we dont
					 * want to update our rack_rtt.
					 *
					 * Hmm should there be a CC revert here?
					 *
					 */
					return (0);
				}
				if (!tp->t_rttlow || tp->t_rttlow > t)
					tp->t_rttlow = t;
				if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
					rack->r_ctl.rc_rack_min_rtt = t;
					if (rack->r_ctl.rc_rack_min_rtt == 0) {
						/* 0 means "unset", so clamp a real min to 1 */
						rack->r_ctl.rc_rack_min_rtt = 1;
					}
				}
				if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
				    (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
					    (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
					/* New more recent rack_tmit_time */
					rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
					if (rack->r_ctl.rc_rack_tmit_time == 0)
						rack->r_ctl.rc_rack_tmit_time = 1;
					rack->rc_rack_rtt = t;
				}
				rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
				/* Low confidence (0) sample since this was retransmitted */
				tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
				    rsm->r_rtr_cnt);
				return (1);
			}
		}
		/* If we are logging log out the sendmap */
		if (tcp_bblogging_on(rack->rc_tp)) {
			for (i = 0; i < rsm->r_rtr_cnt; i++) {
				rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr);
			}
		}
		/* No transmission matched the echoed timestamp */
		goto ts_not_found;
	} else {
		/*
		 * Ok its a SACK block that we retransmitted. or a windows
		 * machine without timestamps. We can tell nothing from the
		 * time-stamp since its not there or the time the peer last
		 * received a segment that moved forward its cum-ack point.
		 */
ts_not_found:
		/* Measure from the most recent (re)transmission */
		i = rsm->r_rtr_cnt - 1;
		t = cts - (uint32_t)rsm->r_tim_lastsent[i];
		if ((int)t <= 0)
			t = 1;
		if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
			/*
			 * We retransmitted and the ack came back in less
			 * than the smallest rtt we have observed. We most
			 * likely did an improper retransmit as outlined in
			 * 6.2 Step 2 point 2 in the rack-draft so we
			 * don't want to update our rack_rtt. We in
			 * theory (in future) might want to think about reverting our
			 * cwnd state but we won't for now.
			 */
			return (0);
		} else if (rack->r_ctl.rc_rack_min_rtt) {
			/*
			 * We retransmitted it and the retransmit did the
			 * job.
			 */
			/* First condition is redundant here (min rtt known non-zero above) */
			if (!rack->r_ctl.rc_rack_min_rtt ||
			    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
				rack->r_ctl.rc_rack_min_rtt = t;
				if (rack->r_ctl.rc_rack_min_rtt == 0) {
					rack->r_ctl.rc_rack_min_rtt = 1;
				}
			}
			if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
			    (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
				    (uint32_t)rsm->r_tim_lastsent[i]))) {
				/* New more recent rack_tmit_time */
				rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
				if (rack->r_ctl.rc_rack_tmit_time == 0)
					rack->r_ctl.rc_rack_tmit_time = 1;
				rack->rc_rack_rtt = t;
			}
			return (1);
		}
	}
	return (0);
}
8956
8957 /*
8958 * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
8959 */
8960 static void
rack_log_sack_passed(struct tcpcb * tp,struct tcp_rack * rack,struct rack_sendmap * rsm,uint32_t cts,int line)8961 rack_log_sack_passed(struct tcpcb *tp,
8962 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts, int line)
8963 {
8964 struct rack_sendmap *nrsm;
8965 uint32_t thresh;
8966
8967 /* Get our rxt threshold for lost consideration */
8968 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
8969 /* Now start looking at rsm's */
8970 nrsm = rsm;
8971 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
8972 rack_head, r_tnext) {
8973 if (nrsm == rsm) {
8974 /* Skip original segment he is acked */
8975 continue;
8976 }
8977 if (nrsm->r_flags & RACK_ACKED) {
8978 /*
8979 * Skip ack'd segments, though we
8980 * should not see these, since tmap
8981 * should not have ack'd segments.
8982 */
8983 continue;
8984 }
8985 if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
8986 /*
8987 * If the peer dropped the rwnd on
8988 * these then we don't worry about them.
8989 */
8990 continue;
8991 }
8992 /* Check lost state */
8993 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
8994 uint32_t exp;
8995
8996 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
8997 if (TSTMP_LT(exp, cts) || (exp == cts)) {
8998 /* We consider it lost */
8999 nrsm->r_flags |= RACK_WAS_LOST;
9000 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
9001 }
9002 }
9003 if (nrsm->r_flags & RACK_SACK_PASSED) {
9004 /*
9005 * We found one that is already marked
9006 * passed, we have been here before and
9007 * so all others below this are marked.
9008 */
9009 break;
9010 }
9011 rack_log_dsack_event(rack, 12, __LINE__, nrsm->r_start, nrsm->r_end);
9012 nrsm->r_flags |= RACK_SACK_PASSED;
9013 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
9014 }
9015 }
9016
/*
 * Re-anchor an in-progress goodput (gput) measurement when the ack of
 * rsm reaches or passes the measurement's start point (tp->gput_seq).
 *
 * use_which selects the new start sequence: RACK_USE_BEG (partial ack,
 * start at rsm->r_start), RACK_USE_END (rsm fully consumed/sacked,
 * start at rsm->r_end) or RACK_USE_END_OR_THACK (cum-ack case, start at
 * whichever of th_ack/rsm->r_end is further along). The measurement's
 * timestamp, send-time anchor and ack target (gput_ack) are updated to
 * match, and the measurement is abandoned if the remaining window is
 * too small to be trusted. No-op unless TF_GPUTINPROG is set and the
 * rsm reaches gput_seq.
 */
static void
rack_need_set_test(struct tcpcb *tp,
    struct tcp_rack *rack,
    struct rack_sendmap *rsm,
    tcp_seq th_ack,
    int line,
    int use_which)
{
	struct rack_sendmap *s_rsm;

	if ((tp->t_flags & TF_GPUTINPROG) &&
	    SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
		/*
		 * We were app limited, and this ack
		 * butts up or goes beyond the point where we want
		 * to start our next measurement. We need
		 * to record the new gput_ts as here and
		 * possibly update the start sequence.
		 */
		uint32_t seq, ts;

		if (rsm->r_rtr_cnt > 1) {
			/*
			 * This is a retransmit, can we
			 * really make any assessment at this
			 * point? We are not really sure of
			 * the timestamp, is it this or the
			 * previous transmission?
			 *
			 * Lets wait for something better that
			 * is not retransmitted.
			 */
			return;
		}
		/* Remember the old anchors for logging below */
		seq = tp->gput_seq;
		ts = tp->gput_ts;
		rack->app_limited_needs_set = 0;
		tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
		/* Do we start at a new end? */
		if ((use_which == RACK_USE_BEG) &&
		    SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
			/*
			 * When we get an ACK that just eats
			 * up some of the rsm, we set RACK_USE_BEG
			 * since whats at r_start (i.e. th_ack)
			 * is left unacked and thats where the
			 * measurement now starts.
			 */
			tp->gput_seq = rsm->r_start;
		}
		if ((use_which == RACK_USE_END) &&
		    SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
			/*
			 * We use the end when the cumack
			 * is moving forward and completely
			 * deleting the rsm passed so basically
			 * r_end holds th_ack.
			 *
			 * For SACK's we also want to use the end
			 * since this piece just got sacked and
			 * we want to target anything after that
			 * in our measurement.
			 */
			tp->gput_seq = rsm->r_end;
		}
		if (use_which == RACK_USE_END_OR_THACK) {
			/*
			 * special case for ack moving forward,
			 * not a sack, we need to move all the
			 * way up to where this ack cum-ack moves
			 * to.
			 */
			if (SEQ_GT(th_ack, rsm->r_end))
				tp->gput_seq = th_ack;
			else
				tp->gput_seq = rsm->r_end;
		}
		if (SEQ_LT(tp->gput_seq, tp->snd_max))
			s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
		else
			s_rsm = NULL;
		/*
		 * Pick up the correct send time if we can the rsm passed in
		 * may be equal to s_rsm if the RACK_USE_BEG was set. For the other
		 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will
		 * find a different seq i.e. the next send up.
		 *
		 * If that has not been sent, s_rsm will be NULL and we must
		 * arrange it so this function will get called again by setting
		 * app_limited_needs_set.
		 */
		if (s_rsm)
			rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0];
		else {
			/* If we hit here we have to have *not* sent tp->gput_seq */
			rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
			/* Set it up so we will go through here again */
			rack->app_limited_needs_set = 1;
		}
		if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
			/*
			 * We moved beyond this guy's range, re-calculate
			 * the new end point.
			 */
			if (rack->rc_gp_filled == 0) {
				tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
			} else {
				tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
			}
		}
		/*
		 * We are moving the goal post, we may be able to clear the
		 * measure_saw_probe_rtt flag.
		 */
		if ((rack->in_probe_rtt == 0) &&
		    (rack->measure_saw_probe_rtt) &&
		    (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
			rack->measure_saw_probe_rtt = 0;
		rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
					   seq, tp->gput_seq,
					   (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
					    (uint64_t)rack->r_ctl.rc_gp_output_ts),
					   5, line, NULL, 0);
		if (rack->rc_gp_filled &&
		    ((tp->gput_ack - tp->gput_seq) <
		     max(rc_init_window(rack), (MIN_GP_WIN *
						ctf_fixed_maxseg(tp))))) {
			uint32_t ideal_amount;

			ideal_amount = rack_get_measure_window(tp, rack);
			if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) {
				/*
				 * There is no sense of continuing this measurement
				 * because its too small to gain us anything we
				 * trust. Skip it and that way we can start a new
				 * measurement quicker.
				 */
				tp->t_flags &= ~TF_GPUTINPROG;
				rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
							   0, 0,
							   (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
							    (uint64_t)rack->r_ctl.rc_gp_output_ts),
							   6, __LINE__, NULL, 0);
			} else {
				/*
				 * Reset the window further out.
				 */
				tp->gput_ack = tp->gput_seq + ideal_amount;
			}
		}
		rack_tend_gp_marks(tp, rack);
		rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm);
	}
}
9171
9172 static inline int
is_rsm_inside_declared_tlp_block(struct tcp_rack * rack,struct rack_sendmap * rsm)9173 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
9174 {
9175 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
9176 /* Behind our TLP definition or right at */
9177 return (0);
9178 }
9179 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
9180 /* The start is beyond or right at our end of TLP definition */
9181 return (0);
9182 }
9183 /* It has to be a sub-part of the original TLP recorded */
9184 return (1);
9185 }
9186
9187
/*
 * Detect a "too soon" ack of a once-retransmitted, sack-passed segment
 * while in recovery -- i.e. the ack arrived within a small fraction
 * (srtt / rack_rtt_divisor) of the retransmit time, so the original
 * transmission was almost certainly just reordered, not lost.
 *
 * When detected: record the reorder timestamp, credit back the bytes
 * from recovery_rxt_cnt and, if can_exit_recovery is set and no
 * retransmitted bytes remain outstanding, restore ssthresh and exit
 * recovery entirely. Returns 1 when the reorder case was detected
 * (callers use this to suppress loss-style marking), 0 otherwise.
 */
static int
rack_check_reorder_ack(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, int the_end, uint32_t cts, int can_exit_recovery, int line)
{
	if ((rack_rtt_divisor > 0) &&
	    (rsm->r_rtr_cnt == 2) &&
	    IN_RECOVERY(tp->t_flags) &&
	    (rsm->r_flags & RACK_WAS_SACKPASS)){
		uint32_t fractional, snt_to_ack;

		/* Our "too soon" bound is a fraction of the smoothed rtt */
		fractional = (tp->t_srtt / rack_rtt_divisor);
		if (fractional == 0)
			fractional = 1;
		/* Time from the (single) retransmit to this ack */
		snt_to_ack = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
		if (snt_to_ack <= fractional) {
			/* Reordering, not loss -- note when we saw it */
			rack->r_ctl.rc_reorder_ts = cts;
			KASSERT((rack->r_ctl.recovery_rxt_cnt >= (the_end - rsm->r_start)),
			    ("rsm:%p rack:%p recovery_rxt_cnt would go negative recovery_rxt_cnt:%u sub:%u", rsm, rack, rack->r_ctl.recovery_rxt_cnt, (the_end - rsm->r_start)));
			rack->r_ctl.recovery_rxt_cnt -= (the_end - rsm->r_start);
			rack_log_to_prr(rack, 18, rack->r_ctl.recovery_rxt_cnt, line);
			if (can_exit_recovery && (rack->r_ctl.recovery_rxt_cnt == 0)) {
				/* All rxt bytes accounted for -- undo the recovery episode */
				tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
				rack_exit_recovery(tp, rack, 4);
				rack->r_might_revert = 0;
				rack->r_ctl.retran_during_recovery = 0;
				rack_log_to_prr(rack, 17, snt_to_ack, line);
			}
			return (1);
		}
	}
	return (0);
}
9219
9220 static uint32_t
rack_proc_sack_blk(struct tcpcb * tp,struct tcp_rack * rack,struct sackblk * sack,struct tcpopt * to,struct rack_sendmap ** prsm,uint32_t cts,uint32_t segsiz)9221 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
9222 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
9223 uint32_t segsiz)
9224 {
9225 uint32_t start, end, changed = 0;
9226 struct rack_sendmap stack_map;
9227 struct rack_sendmap *rsm, *nrsm, *prev, *next;
9228 int insret __diagused;
9229 int32_t used_ref = 1;
9230 int can_use_hookery = 0;
9231 int prohibit_marking = 0;
9232
9233 start = sack->start;
9234 end = sack->end;
9235 rsm = *prsm;
9236
9237 do_rest_ofb:
9238 if ((rsm == NULL) ||
9239 (SEQ_LT(end, rsm->r_start)) ||
9240 (SEQ_GEQ(start, rsm->r_end)) ||
9241 (SEQ_LT(start, rsm->r_start))) {
9242 /*
9243 * We are not in the right spot,
9244 * find the correct spot in the tree.
9245 */
9246 used_ref = 0;
9247 rsm = tqhash_find(rack->r_ctl.tqh, start);
9248 }
9249 if (rsm == NULL) {
9250 /* TSNH */
9251 goto out;
9252 }
9253 /* Ok we have an ACK for some piece of this rsm */
9254 if (rsm->r_start != start) {
9255 if ((rsm->r_flags & RACK_ACKED) == 0) {
9256 /*
9257 * Before any splitting or hookery is
9258 * done is it a TLP of interest i.e. rxt?
9259 */
9260 if ((rsm->r_flags & RACK_TLP) &&
9261 (rsm->r_rtr_cnt > 1)) {
9262 /*
9263 * We are splitting a rxt TLP, check
9264 * if we need to save off the start/end
9265 */
9266 if (rack->rc_last_tlp_acked_set &&
9267 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9268 /*
9269 * We already turned this on since we are inside
9270 * the previous one was a partially sack now we
9271 * are getting another one (maybe all of it).
9272 *
9273 */
9274 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9275 /*
9276 * Lets make sure we have all of it though.
9277 */
9278 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9279 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9280 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9281 rack->r_ctl.last_tlp_acked_end);
9282 }
9283 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9284 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9285 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9286 rack->r_ctl.last_tlp_acked_end);
9287 }
9288 } else {
9289 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9290 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9291 rack->rc_last_tlp_past_cumack = 0;
9292 rack->rc_last_tlp_acked_set = 1;
9293 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9294 }
9295 }
9296 /**
9297 * Need to split this in two pieces the before and after,
9298 * the before remains in the map, the after must be
9299 * added. In other words we have:
9300 * rsm |--------------|
9301 * sackblk |------->
9302 * rsm will become
9303 * rsm |---|
9304 * and nrsm will be the sacked piece
9305 * nrsm |----------|
9306 *
9307 * But before we start down that path lets
9308 * see if the sack spans over on top of
9309 * the next guy and it is already sacked.
9310 *
9311 */
9312 /*
9313 * Hookery can only be used if the two entries
9314 * are in the same bucket and neither one of
9315 * them staddle the bucket line.
9316 */
9317 next = tqhash_next(rack->r_ctl.tqh, rsm);
9318 if (next &&
9319 (rsm->bindex == next->bindex) &&
9320 ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9321 ((next->r_flags & RACK_STRADDLE) == 0) &&
9322 ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) &&
9323 ((next->r_flags & RACK_WAS_SACKPASS) == 0) &&
9324 ((rsm->r_flags & RACK_IS_PCM) == 0) &&
9325 ((next->r_flags & RACK_IS_PCM) == 0) &&
9326 (rsm->r_flags & RACK_IN_GP_WIN) &&
9327 (next->r_flags & RACK_IN_GP_WIN))
9328 can_use_hookery = 1;
9329 else
9330 can_use_hookery = 0;
9331 if (next && can_use_hookery &&
9332 (next->r_flags & RACK_ACKED) &&
9333 SEQ_GEQ(end, next->r_start)) {
9334 /**
9335 * So the next one is already acked, and
9336 * we can thus by hookery use our stack_map
9337 * to reflect the piece being sacked and
9338 * then adjust the two tree entries moving
9339 * the start and ends around. So we start like:
9340 * rsm |------------| (not-acked)
9341 * next |-----------| (acked)
9342 * sackblk |-------->
9343 * We want to end like so:
9344 * rsm |------| (not-acked)
9345 * next |-----------------| (acked)
9346 * nrsm |-----|
9347 * Where nrsm is a temporary stack piece we
9348 * use to update all the gizmos.
9349 */
9350 /* Copy up our fudge block */
9351 nrsm = &stack_map;
9352 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9353 /* Now adjust our tree blocks */
9354 tqhash_update_end(rack->r_ctl.tqh, rsm, start);
9355 next->r_start = start;
9356 rsm->r_flags |= RACK_SHUFFLED;
9357 next->r_flags |= RACK_SHUFFLED;
9358 /* Now we must adjust back where next->m is */
9359 rack_setup_offset_for_rsm(rack, rsm, next);
9360 /*
9361 * Which timestamp do we keep? It is rather
9362 * important in GP measurements to have the
9363 * accurate end of the send window.
9364 *
9365 * We keep the largest value, which is the newest
9366 * send. We do this in case a segment that is
9367 * joined together and not part of a GP estimate
9368 * later gets expanded into the GP estimate.
9369 *
9370 * We prohibit the merging of unlike kinds i.e.
9371 * all pieces that are in the GP estimate can be
9372 * merged and all pieces that are not in a GP estimate
9373 * can be merged, but not disimilar pieces. Combine
9374 * this with taking the highest here and we should
9375 * be ok unless of course the client reneges. Then
9376 * all bets are off.
9377 */
9378 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] <
9379 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)])
9380 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)];
9381 /*
9382 * And we must keep the newest ack arrival time.
9383 */
9384 if (next->r_ack_arrival <
9385 rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9386 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9387
9388
9389 /* We don't need to adjust rsm, it did not change */
9390 /* Clear out the dup ack count of the remainder */
9391 rsm->r_dupack = 0;
9392 rsm->r_just_ret = 0;
9393 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9394 /* Now lets make sure our fudge block is right */
9395 nrsm->r_start = start;
9396 /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */
9397 prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
9398 /* Now lets update all the stats and such */
9399 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9400 if (rack->app_limited_needs_set)
9401 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9402 changed += (nrsm->r_end - nrsm->r_start);
9403 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9404 if (rsm->r_flags & RACK_WAS_LOST) {
9405 int my_chg;
9406
9407 /*
9408 * Note here we do not use our rack_mark_nolonger_lost() function
9409 * since we are moving our data pointer around and the
9410 * ack'ed side is already not considered lost.
9411 */
9412 my_chg = (nrsm->r_end - nrsm->r_start);
9413 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
9414 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
9415 if (my_chg <= rack->r_ctl.rc_considered_lost)
9416 rack->r_ctl.rc_considered_lost -= my_chg;
9417 else
9418 rack->r_ctl.rc_considered_lost = 0;
9419 }
9420 if (nrsm->r_flags & RACK_SACK_PASSED) {
9421 rack->r_ctl.rc_reorder_ts = cts;
9422 if (rack->r_ctl.rc_reorder_ts == 0)
9423 rack->r_ctl.rc_reorder_ts = 1;
9424 }
9425 /*
9426 * Now we want to go up from rsm (the
9427 * one left un-acked) to the next one
9428 * in the tmap. We do this so when
9429 * we walk backwards we include marking
9430 * sack-passed on rsm (The one passed in
9431 * is skipped since it is generally called
9432 * on something sacked before removing it
9433 * from the tmap).
9434 */
9435 if (rsm->r_in_tmap) {
9436 nrsm = TAILQ_NEXT(rsm, r_tnext);
9437 /*
9438 * Now that we have the next
9439 * one walk backwards from there.
9440 */
9441 if (nrsm && nrsm->r_in_tmap && (prohibit_marking == 0))
9442 rack_log_sack_passed(tp, rack, nrsm, cts, __LINE__);
9443 }
9444 /* Now are we done? */
9445 if (SEQ_LT(end, next->r_end) ||
9446 (end == next->r_end)) {
9447 /* Done with block */
9448 goto out;
9449 }
9450 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
9451 /* Postion for the next block */
9452 start = next->r_end;
9453 rsm = tqhash_next(rack->r_ctl.tqh, next);
9454 if (rsm == NULL)
9455 goto out;
9456 } else {
9457 /**
9458 * We can't use any hookery here, so we
9459 * need to split the map. We enter like
9460 * so:
9461 * rsm |--------|
9462 * sackblk |----->
9463 * We will add the new block nrsm and
9464 * that will be the new portion, and then
9465 * fall through after reseting rsm. So we
9466 * split and look like this:
9467 * rsm |----|
9468 * sackblk |----->
9469 * nrsm |---|
9470 * We then fall through reseting
9471 * rsm to nrsm, so the next block
9472 * picks it up.
9473 */
9474 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9475 if (nrsm == NULL) {
9476 /*
9477 * failed XXXrrs what can we do but loose the sack
9478 * info?
9479 */
9480 goto out;
9481 }
9482 rack_clone_rsm(rack, nrsm, rsm, start);
9483 rsm->r_just_ret = 0;
9484 #ifndef INVARIANTS
9485 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
9486 #else
9487 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
9488 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
9489 nrsm, insret, rack, rsm);
9490 }
9491 #endif
9492 if (rsm->r_in_tmap) {
9493 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
9494 nrsm->r_in_tmap = 1;
9495 }
9496 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
9497 rsm->r_flags &= (~RACK_HAS_FIN);
9498 /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */
9499 prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
9500 /* Position us to point to the new nrsm that starts the sack blk */
9501 rsm = nrsm;
9502 }
9503 } else {
9504 /* Already sacked this piece */
9505 if (end == rsm->r_end) {
9506 /* Done with block */
9507 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9508 goto out;
9509 } else if (SEQ_LT(end, rsm->r_end)) {
9510 /* A partial sack to a already sacked block */
9511 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9512 goto out;
9513 } else {
9514 /*
9515 * The end goes beyond this guy
9516 * reposition the start to the
9517 * next block.
9518 */
9519 start = rsm->r_end;
9520 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9521 if (rsm == NULL)
9522 goto out;
9523 }
9524 }
9525 }
9526 if (SEQ_GEQ(end, rsm->r_end)) {
9527 /**
9528 * The end of this block is either beyond this guy or right
9529 * at this guy. I.e.:
9530 * rsm --- |-----|
9531 * end |-----|
9532 * <or>
9533 * end |---------|
9534 */
9535 if ((rsm->r_flags & RACK_ACKED) == 0) {
9536 /*
9537 * Is it a TLP of interest?
9538 */
9539 if ((rsm->r_flags & RACK_TLP) &&
9540 (rsm->r_rtr_cnt > 1)) {
9541 /*
9542 * We are splitting a rxt TLP, check
9543 * if we need to save off the start/end
9544 */
9545 if (rack->rc_last_tlp_acked_set &&
9546 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9547 /*
9548 * We already turned this on since we are inside
9549 * the previous one was a partially sack now we
9550 * are getting another one (maybe all of it).
9551 */
9552 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9553 /*
9554 * Lets make sure we have all of it though.
9555 */
9556 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9557 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9558 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9559 rack->r_ctl.last_tlp_acked_end);
9560 }
9561 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9562 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9563 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9564 rack->r_ctl.last_tlp_acked_end);
9565 }
9566 } else {
9567 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9568 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9569 rack->rc_last_tlp_past_cumack = 0;
9570 rack->rc_last_tlp_acked_set = 1;
9571 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9572 }
9573 }
9574 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
9575 changed += (rsm->r_end - rsm->r_start);
9576 /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */
9577 prohibit_marking = rack_check_reorder_ack(tp, rack, rsm, rsm->r_end, cts, 0, __LINE__);
9578 /* You get a count for acking a whole segment or more */
9579 if (rsm->r_flags & RACK_WAS_LOST) {
9580 /*
9581 * Here we can use the inline function since
9582 * the rsm is truly marked lost and now no longer lost.
9583 */
9584 rack_mark_nolonger_lost(rack, rsm);
9585 }
9586 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
9587 if (rsm->r_in_tmap && (prohibit_marking == 0)) /* should be true */
9588 rack_log_sack_passed(tp, rack, rsm, cts, __LINE__);
9589
9590 /* Is Reordering occuring? */
9591 if (rsm->r_flags & RACK_SACK_PASSED) {
9592 rsm->r_flags &= ~RACK_SACK_PASSED;
9593 rack->r_ctl.rc_reorder_ts = cts;
9594 if (rack->r_ctl.rc_reorder_ts == 0)
9595 rack->r_ctl.rc_reorder_ts = 1;
9596 }
9597 if (rack->app_limited_needs_set)
9598 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
9599 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9600 rsm->r_flags |= RACK_ACKED;
9601 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
9602 if (rsm->r_in_tmap) {
9603 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9604 rsm->r_in_tmap = 0;
9605 }
9606 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
9607 }
9608 if (end == rsm->r_end) {
9609 /* This block only - done, setup for next */
9610 goto out;
9611 }
9612 /*
9613 * There is more not coverend by this rsm move on
9614 * to the next block in the tail queue hash table.
9615 */
9616 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
9617 start = rsm->r_end;
9618 rsm = nrsm;
9619 if (rsm == NULL)
9620 goto out;
9621 goto do_rest_ofb;
9622 }
9623 /**
9624 * The end of this sack block is smaller than
9625 * our rsm i.e.:
9626 * rsm --- |-----|
9627 * end |--|
9628 */
9629 if ((rsm->r_flags & RACK_ACKED) == 0) {
9630 /*
9631 * Is it a TLP of interest?
9632 */
9633 if ((rsm->r_flags & RACK_TLP) &&
9634 (rsm->r_rtr_cnt > 1)) {
9635 /*
9636 * We are splitting a rxt TLP, check
9637 * if we need to save off the start/end
9638 */
9639 if (rack->rc_last_tlp_acked_set &&
9640 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9641 /*
9642 * We already turned this on since we are inside
9643 * the previous one was a partially sack now we
9644 * are getting another one (maybe all of it).
9645 */
9646 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9647 /*
9648 * Lets make sure we have all of it though.
9649 */
9650 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9651 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9652 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9653 rack->r_ctl.last_tlp_acked_end);
9654 }
9655 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9656 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9657 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9658 rack->r_ctl.last_tlp_acked_end);
9659 }
9660 } else {
9661 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9662 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9663 rack->rc_last_tlp_past_cumack = 0;
9664 rack->rc_last_tlp_acked_set = 1;
9665 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9666 }
9667 }
9668 /*
9669 * Hookery can only be used if the two entries
9670 * are in the same bucket and neither one of
9671 * them staddle the bucket line.
9672 */
9673 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9674 if (prev &&
9675 (rsm->bindex == prev->bindex) &&
9676 ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9677 ((prev->r_flags & RACK_STRADDLE) == 0) &&
9678 ((prev->r_flags & RACK_WAS_SACKPASS) == 0) &&
9679 ((rsm->r_flags & RACK_WAS_SACKPASS) == 0) &&
9680 ((rsm->r_flags & RACK_IS_PCM) == 0) &&
9681 ((prev->r_flags & RACK_IS_PCM) == 0) &&
9682 (rsm->r_flags & RACK_IN_GP_WIN) &&
9683 (prev->r_flags & RACK_IN_GP_WIN))
9684 can_use_hookery = 1;
9685 else
9686 can_use_hookery = 0;
9687 if (prev && can_use_hookery &&
9688 (prev->r_flags & RACK_ACKED)) {
9689 /**
9690 * Goal, we want the right remainder of rsm to shrink
9691 * in place and span from (rsm->r_start = end) to rsm->r_end.
9692 * We want to expand prev to go all the way
9693 * to prev->r_end <- end.
9694 * so in the tree we have before:
9695 * prev |--------| (acked)
9696 * rsm |-------| (non-acked)
9697 * sackblk |-|
9698 * We churn it so we end up with
9699 * prev |----------| (acked)
9700 * rsm |-----| (non-acked)
9701 * nrsm |-| (temporary)
9702 *
9703 * Note if either prev/rsm is a TLP we don't
9704 * do this.
9705 */
9706 nrsm = &stack_map;
9707 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9708 tqhash_update_end(rack->r_ctl.tqh, prev, end);
9709 rsm->r_start = end;
9710 rsm->r_flags |= RACK_SHUFFLED;
9711 prev->r_flags |= RACK_SHUFFLED;
9712 /* Now adjust nrsm (stack copy) to be
9713 * the one that is the small
9714 * piece that was "sacked".
9715 */
9716 nrsm->r_end = end;
9717 rsm->r_dupack = 0;
9718 /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */
9719 prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
9720 /*
9721 * Which timestamp do we keep? It is rather
9722 * important in GP measurements to have the
9723 * accurate end of the send window.
9724 *
9725 * We keep the largest value, which is the newest
9726 * send. We do this in case a segment that is
9727 * joined together and not part of a GP estimate
9728 * later gets expanded into the GP estimate.
9729 *
9730 * We prohibit the merging of unlike kinds i.e.
9731 * all pieces that are in the GP estimate can be
9732 * merged and all pieces that are not in a GP estimate
9733 * can be merged, but not disimilar pieces. Combine
9734 * this with taking the highest here and we should
9735 * be ok unless of course the client reneges. Then
9736 * all bets are off.
9737 */
9738 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] <
9739 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) {
9740 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
9741 }
9742 /*
9743 * And we must keep the newest ack arrival time.
9744 */
9745
9746 if(prev->r_ack_arrival <
9747 rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9748 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9749
9750 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9751 /*
9752 * Now that the rsm has had its start moved forward
9753 * lets go ahead and get its new place in the world.
9754 */
9755 rack_setup_offset_for_rsm(rack, prev, rsm);
9756 /*
9757 * Now nrsm is our new little piece
9758 * that is acked (which was merged
9759 * to prev). Update the rtt and changed
9760 * based on that. Also check for reordering.
9761 */
9762 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9763 if (rack->app_limited_needs_set)
9764 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9765 changed += (nrsm->r_end - nrsm->r_start);
9766 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9767 if (rsm->r_flags & RACK_WAS_LOST) {
9768 int my_chg;
9769
9770 /*
9771 * Note here we are using hookery again so we can't
9772 * use our rack_mark_nolonger_lost() function.
9773 */
9774 my_chg = (nrsm->r_end - nrsm->r_start);
9775 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
9776 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
9777 if (my_chg <= rack->r_ctl.rc_considered_lost)
9778 rack->r_ctl.rc_considered_lost -= my_chg;
9779 else
9780 rack->r_ctl.rc_considered_lost = 0;
9781 }
9782 if (nrsm->r_flags & RACK_SACK_PASSED) {
9783 rack->r_ctl.rc_reorder_ts = cts;
9784 if (rack->r_ctl.rc_reorder_ts == 0)
9785 rack->r_ctl.rc_reorder_ts = 1;
9786 }
9787 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
9788 rsm = prev;
9789 } else {
9790 /**
9791 * This is the case where our previous
9792 * block is not acked either, so we must
9793 * split the block in two.
9794 */
9795 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9796 if (nrsm == NULL) {
9797 /* failed rrs what can we do but loose the sack info? */
9798 goto out;
9799 }
9800 if ((rsm->r_flags & RACK_TLP) &&
9801 (rsm->r_rtr_cnt > 1)) {
9802 /*
9803 * We are splitting a rxt TLP, check
9804 * if we need to save off the start/end
9805 */
9806 if (rack->rc_last_tlp_acked_set &&
9807 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9808 /*
9809 * We already turned this on since this block is inside
9810 * the previous one was a partially sack now we
9811 * are getting another one (maybe all of it).
9812 */
9813 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9814 /*
9815 * Lets make sure we have all of it though.
9816 */
9817 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9818 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9819 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9820 rack->r_ctl.last_tlp_acked_end);
9821 }
9822 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9823 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9824 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9825 rack->r_ctl.last_tlp_acked_end);
9826 }
9827 } else {
9828 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9829 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9830 rack->rc_last_tlp_acked_set = 1;
9831 rack->rc_last_tlp_past_cumack = 0;
9832 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9833 }
9834 }
9835 /**
9836 * In this case nrsm becomes
9837 * nrsm->r_start = end;
9838 * nrsm->r_end = rsm->r_end;
9839 * which is un-acked.
9840 * <and>
9841 * rsm->r_end = nrsm->r_start;
9842 * i.e. the remaining un-acked
9843 * piece is left on the left
9844 * hand side.
9845 *
9846 * So we start like this
9847 * rsm |----------| (not acked)
9848 * sackblk |---|
9849 * build it so we have
9850 * rsm |---| (acked)
9851 * nrsm |------| (not acked)
9852 */
9853 rack_clone_rsm(rack, nrsm, rsm, end);
9854 rsm->r_flags &= (~RACK_HAS_FIN);
9855 rsm->r_just_ret = 0;
9856 #ifndef INVARIANTS
9857 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
9858 #else
9859 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
9860 panic("Insert in tailq_hash of %p fails ret:% rack:%p rsm:%p",
9861 nrsm, insret, rack, rsm);
9862 }
9863 #endif
9864 if (rsm->r_in_tmap) {
9865 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
9866 nrsm->r_in_tmap = 1;
9867 }
9868 nrsm->r_dupack = 0;
9869 /* Check if the ack was too soon i.e. reordering + ack arrives too quickly */
9870 prohibit_marking = rack_check_reorder_ack(tp, rack, nrsm, nrsm->r_end, cts, 0, __LINE__);
9871 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
9872 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
9873 changed += (rsm->r_end - rsm->r_start);
9874 if (rsm->r_flags & RACK_WAS_LOST) {
9875 /*
9876 * Here it is safe to use our function.
9877 */
9878 rack_mark_nolonger_lost(rack, rsm);
9879 }
9880 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
9881
9882 if (rsm->r_in_tmap && (prohibit_marking == 0)) /* should be true */
9883 rack_log_sack_passed(tp, rack, rsm, cts, __LINE__);
9884 /* Is Reordering occuring? */
9885 if (rsm->r_flags & RACK_SACK_PASSED) {
9886 rsm->r_flags &= ~RACK_SACK_PASSED;
9887 rack->r_ctl.rc_reorder_ts = cts;
9888 if (rack->r_ctl.rc_reorder_ts == 0)
9889 rack->r_ctl.rc_reorder_ts = 1;
9890 }
9891 if (rack->app_limited_needs_set)
9892 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
9893 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9894 rsm->r_flags |= RACK_ACKED;
9895 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
9896 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
9897 if (rsm->r_in_tmap) {
9898 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9899 rsm->r_in_tmap = 0;
9900 }
9901 }
9902 }
9903 out:
9904 if (rsm &&
9905 ((rsm->r_flags & RACK_TLP) == 0) &&
9906 (rsm->r_flags & RACK_ACKED)) {
9907 /*
9908 * Now can we merge where we worked
9909 * with either the previous or
9910 * next block?
9911 */
9912 next = tqhash_next(rack->r_ctl.tqh, rsm);
9913 while (next) {
9914 if (next->r_flags & RACK_TLP)
9915 break;
9916 /* Only allow merges between ones in or out of GP window */
9917 if ((next->r_flags & RACK_IN_GP_WIN) &&
9918 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
9919 break;
9920 }
9921 /* We can't merge retransmitted with sack-pass set */
9922 if ((rsm->r_flags & RACK_WAS_SACKPASS) ||
9923 (next->r_flags & RACK_WAS_SACKPASS))
9924 break;
9925 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
9926 ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
9927 break;
9928 }
9929 if (rsm->bindex != next->bindex)
9930 break;
9931 if (rsm->r_flags & RACK_STRADDLE)
9932 break;
9933 if (rsm->r_flags & RACK_IS_PCM)
9934 break;
9935 if (next->r_flags & RACK_STRADDLE)
9936 break;
9937 if (next->r_flags & RACK_IS_PCM)
9938 break;
9939 if (next->r_flags & RACK_ACKED) {
9940 /* yep this and next can be merged */
9941 rsm = rack_merge_rsm(rack, rsm, next);
9942 next = tqhash_next(rack->r_ctl.tqh, rsm);
9943 } else
9944 break;
9945 }
9946 /* Now what about the previous? */
9947 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9948 while (prev) {
9949 if (prev->r_flags & RACK_TLP)
9950 break;
9951 /* Only allow merges between ones in or out of GP window */
9952 if ((prev->r_flags & RACK_IN_GP_WIN) &&
9953 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
9954 break;
9955 }
9956 /* We can't merge retransmitted with sack-pass set */
9957 if ((rsm->r_flags & RACK_WAS_SACKPASS) ||
9958 (prev->r_flags & RACK_WAS_SACKPASS))
9959 break;
9960 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
9961 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) {
9962 break;
9963 }
9964 if (rsm->bindex != prev->bindex)
9965 break;
9966 if (rsm->r_flags & RACK_STRADDLE)
9967 break;
9968 if (rsm->r_flags & RACK_IS_PCM)
9969 break;
9970 if (prev->r_flags & RACK_STRADDLE)
9971 break;
9972 if (prev->r_flags & RACK_IS_PCM)
9973 break;
9974 if (prev->r_flags & RACK_ACKED) {
9975 /* yep the previous and this can be merged */
9976 rsm = rack_merge_rsm(rack, prev, rsm);
9977 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9978 } else
9979 break;
9980 }
9981 }
9982 if (used_ref == 0) {
9983 counter_u64_add(rack_sack_proc_all, 1);
9984 } else {
9985 counter_u64_add(rack_sack_proc_short, 1);
9986 }
9987 /* Save off the next one for quick reference. */
9988 nrsm = tqhash_find(rack->r_ctl.tqh, end);
9989 *prsm = rack->r_ctl.rc_sacklast = nrsm;
9990 return (changed);
9991 }
9992
9993 static void inline
rack_peer_reneges(struct tcp_rack * rack,struct rack_sendmap * rsm,tcp_seq th_ack)9994 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
9995 {
9996 struct rack_sendmap *tmap;
9997
9998 tmap = NULL;
9999 while (rsm && (rsm->r_flags & RACK_ACKED)) {
10000 /* Its no longer sacked, mark it so */
10001 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
10002 #ifdef INVARIANTS
10003 if (rsm->r_in_tmap) {
10004 panic("rack:%p rsm:%p flags:0x%x in tmap?",
10005 rack, rsm, rsm->r_flags);
10006 }
10007 #endif
10008 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
10009 /* Rebuild it into our tmap */
10010 if (tmap == NULL) {
10011 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10012 tmap = rsm;
10013 } else {
10014 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
10015 tmap = rsm;
10016 }
10017 tmap->r_in_tmap = 1;
10018 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
10019 }
10020 /*
10021 * Now lets possibly clear the sack filter so we start
10022 * recognizing sacks that cover this area.
10023 */
10024 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
10025
10026 }
10027
10028
10029 static void inline
rack_rsm_sender_update(struct tcp_rack * rack,struct tcpcb * tp,struct rack_sendmap * rsm,uint8_t from)10030 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from)
10031 {
10032 /*
10033 * We look at advancing the end send time for our GP
10034 * measurement tracking only as the cumulative acknowledgment
10035 * moves forward. You might wonder about this, why not
10036 * at every transmission or retransmission within the
10037 * GP window update the rc_gp_cumack_ts? Well its rather
10038 * nuanced but basically the GP window *may* expand (as
10039 * it does below) or worse and harder to track it may shrink.
10040 *
10041 * This last makes it impossible to track at the time of
10042 * the send, since you may set forward your rc_gp_cumack_ts
10043 * when you send, because that send *is* in your currently
10044 * "guessed" window, but then it shrinks. Now which was
10045 * the send time of the last bytes in the window, by the
10046 * time you ask that question that part of the sendmap
10047 * is freed. So you don't know and you will have too
10048 * long of send window. Instead by updating the time
10049 * marker only when the cumack advances this assures us
10050 * that we will have only the sends in the window of our
10051 * GP measurement.
10052 *
10053 * Another complication from this is the
10054 * merging of sendmap entries. During SACK processing this
10055 * can happen to conserve the sendmap size. That breaks
10056 * everything down in tracking the send window of the GP
10057 * estimate. So to prevent that and keep it working with
10058 * a tiny bit more limited merging, we only allow like
10059 * types to be merged. I.e. if two sends are in the GP window
10060 * then its ok to merge them together. If two sends are not
10061 * in the GP window its ok to merge them together too. Though
10062 * one send in and one send out cannot be merged. We combine
10063 * this with never allowing the shrinking of the GP window when
10064 * we are in recovery so that we can properly calculate the
10065 * sending times.
10066 *
10067 * This all of course seems complicated, because it is.. :)
10068 *
10069 * The cum-ack is being advanced upon the sendmap.
10070 * If we are not doing a GP estimate don't
10071 * proceed.
10072 */
10073 uint64_t ts;
10074
10075 if ((tp->t_flags & TF_GPUTINPROG) == 0)
10076 return;
10077 /*
10078 * If this sendmap entry is going
10079 * beyond the measurement window we had picked,
10080 * expand the measurement window by that much.
10081 */
10082 if (SEQ_GT(rsm->r_end, tp->gput_ack)) {
10083 tp->gput_ack = rsm->r_end;
10084 }
10085 /*
10086 * If we have not setup a ack, then we
10087 * have no idea if the newly acked pieces
10088 * will be "in our seq measurement range". If
10089 * it is when we clear the app_limited_needs_set
10090 * flag the timestamp will be updated.
10091 */
10092 if (rack->app_limited_needs_set)
10093 return;
10094 /*
10095 * Finally, we grab out the latest timestamp
10096 * that this packet was sent and then see
10097 * if:
10098 * a) The packet touches are newly defined GP range.
10099 * b) The time is greater than (newer) than the
10100 * one we currently have. If so we update
10101 * our sending end time window.
10102 *
10103 * Note we *do not* do this at send time. The reason
10104 * is that if you do you *may* pick up a newer timestamp
10105 * for a range you are not going to measure. We project
10106 * out how far and then sometimes modify that to be
10107 * smaller. If that occurs then you will have a send
10108 * that does not belong to the range included.
10109 */
10110 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <=
10111 rack->r_ctl.rc_gp_cumack_ts)
10112 return;
10113 if (rack_in_gp_window(tp, rsm)) {
10114 rack->r_ctl.rc_gp_cumack_ts = ts;
10115 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end,
10116 __LINE__, from, rsm);
10117 }
10118 }
10119
/*
 * Advance all RACK state up to a new cumulative ack point (th_ack):
 * prune the SACK filter, age remembered TLP state, free fully-acked
 * sendmap entries (taking RTT samples and accounting), trim the
 * partially-acked head entry in place (sequence start, retransmit
 * byte counts and mbuf offset all move forward), and detect a peer
 * reneging on previously SACKed data.
 */
static void
rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime)
{
	struct rack_sendmap *rsm;
	/*
	 * The ACK point is advancing to th_ack, we must drop off
	 * the packets in the rack log and calculate any eligible
	 * RTT's.
	 */

	if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) {
		/*
		 * If we have some sack blocks in the filter
		 * lets prune them out by calling sfb with no blocks.
		 */
		sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack);
	}
	if (SEQ_GT(th_ack, tp->snd_una)) {
		/* Clear any app ack remembered settings */
		rack->r_ctl.cleared_app_ack = 0;
	}
	rack->r_wanted_output = 1;
	if (SEQ_GT(th_ack, tp->snd_una))
		rack->r_ctl.last_cumack_advance = acktime;

	/* Tend any TLP that has been marked for 1/2 the seq space (its old) */
	if ((rack->rc_last_tlp_acked_set == 1)&&
	    (rack->rc_last_tlp_past_cumack == 1) &&
	    (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
		/*
		 * We have reached the point where our last rack
		 * tlp retransmit sequence is ahead of the cum-ack.
		 * This can only happen when the cum-ack moves all
		 * the way around (its been a full 2^^31+1 bytes
		 * or more since we sent a retransmitted TLP). Lets
		 * turn off the valid flag since its not really valid.
		 *
		 * Note since sack's also turn on this event we have
		 * a complication, we have to wait to age it out until
		 * the cum-ack is by the TLP before checking which is
		 * what the next else clause does.
		 */
		rack_log_dsack_event(rack, 9, __LINE__,
				     rack->r_ctl.last_tlp_acked_start,
				     rack->r_ctl.last_tlp_acked_end);
		rack->rc_last_tlp_acked_set = 0;
		rack->rc_last_tlp_past_cumack = 0;
	} else if ((rack->rc_last_tlp_acked_set == 1) &&
		   (rack->rc_last_tlp_past_cumack == 0) &&
		   (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
		/*
		 * It is safe to start aging TLP's out.
		 */
		rack->rc_last_tlp_past_cumack = 1;
	}
	/* We do the same for the tlp send seq as well */
	if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
	    (rack->rc_last_sent_tlp_past_cumack == 1) &&
	    (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) {
		rack_log_dsack_event(rack, 9, __LINE__,
				     rack->r_ctl.last_sent_tlp_seq,
				     (rack->r_ctl.last_sent_tlp_seq +
				      rack->r_ctl.last_sent_tlp_len));
		rack->rc_last_sent_tlp_seq_valid = 0;
		rack->rc_last_sent_tlp_past_cumack = 0;
	} else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
		   (rack->rc_last_sent_tlp_past_cumack == 0) &&
		   (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
		/*
		 * It is safe to start aging TLP's send.
		 */
		rack->rc_last_sent_tlp_past_cumack = 1;
	}
more:
	/*
	 * Loop head: each iteration consumes at most one sendmap entry;
	 * we jump back here after freeing a fully-acked entry while
	 * th_ack still extends beyond it.
	 */
	rsm = tqhash_min(rack->r_ctl.tqh);
	if (rsm == NULL) {
		if ((th_ack - 1) == tp->iss) {
			/*
			 * For the SYN incoming case we will not
			 * have called tcp_output for the sending of
			 * the SYN, so there will be no map. All
			 * other cases should probably be a panic.
			 */
			return;
		}
		if (tp->t_flags & TF_SENTFIN) {
			/* if we sent a FIN we often will not have map */
			return;
		}
#ifdef INVARIANTS
		panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n",
		      tp,
		      tp->t_state, th_ack, rack,
		      tp->snd_una, tp->snd_max);
#endif
		return;
	}
	if (SEQ_LT(th_ack, rsm->r_start)) {
		/* Huh map is missing this */
#ifdef INVARIANTS
		printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
		       rsm->r_start,
		       th_ack, tp->t_state, rack->r_state);
#endif
		return;
	}
	rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);

	/* Now was it a retransmitted TLP? */
	if ((rsm->r_flags & RACK_TLP) &&
	    (rsm->r_rtr_cnt > 1)) {
		/*
		 * Yes, this rsm was a TLP and retransmitted, remember that
		 * since if a DSACK comes back on this we don't want
		 * to think of it as a reordered segment. This may
		 * get updated again with possibly even other TLPs
		 * in flight, but thats ok. Only when we don't send
		 * a retransmitted TLP for 1/2 the sequences space
		 * will it get turned off (above).
		 */
		if (rack->rc_last_tlp_acked_set &&
		    (is_rsm_inside_declared_tlp_block(rack, rsm))) {
			/*
			 * We already turned this on since the end matches,
			 * the previous one was a partially ack now we
			 * are getting another one (maybe all of it).
			 */
			rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
			/*
			 * Lets make sure we have all of it though.
			 */
			if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
				rack->r_ctl.last_tlp_acked_start = rsm->r_start;
				rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
						     rack->r_ctl.last_tlp_acked_end);
			}
			if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
				rack->r_ctl.last_tlp_acked_end = rsm->r_end;
				rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
						     rack->r_ctl.last_tlp_acked_end);
			}
		} else {
			rack->rc_last_tlp_past_cumack = 1;
			rack->r_ctl.last_tlp_acked_start = rsm->r_start;
			rack->r_ctl.last_tlp_acked_end = rsm->r_end;
			rack->rc_last_tlp_acked_set = 1;
			rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
		}
	}
	/* Now do we consume the whole thing? */
	rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
	if (SEQ_GEQ(th_ack, rsm->r_end)) {
		/* Its all consumed. */
		uint32_t left;
		uint8_t newly_acked;

		if (rsm->r_flags & RACK_WAS_LOST) {
			/*
			 * This can happen when we marked it as lost
			 * and yet before retransmitting we get an ack
			 * which can happen due to reordering.
			 */
			rack_mark_nolonger_lost(rack, rsm);
		}
		rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
		rsm->r_rtr_bytes = 0;
		/*
		 * Record the time of highest cumack sent if its in our measurement
		 * window and possibly bump out the end.
		 */
		rack_rsm_sender_update(rack, tp, rsm, 4);
		tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
		if (rsm->r_in_tmap) {
			TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
			rsm->r_in_tmap = 0;
		}
		newly_acked = 1;
		if (rsm->r_flags & RACK_ACKED) {
			/*
			 * It was acked on the scoreboard -- remove
			 * it from total
			 */
			rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
			newly_acked = 0;
		} else if (rsm->r_flags & RACK_SACK_PASSED) {
			/*
			 * There are segments ACKED on the
			 * scoreboard further up. We are seeing
			 * reordering.
			 */
			rsm->r_flags &= ~RACK_SACK_PASSED;
			rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
			rsm->r_flags |= RACK_ACKED;
			rack->r_ctl.rc_reorder_ts = cts;
			/* A reorder timestamp of 0 means "unset"; avoid that. */
			if (rack->r_ctl.rc_reorder_ts == 0)
				rack->r_ctl.rc_reorder_ts = 1;
			if (rack->r_ent_rec_ns) {
				/*
				 * We have sent no more, and we saw an sack
				 * then ack arrive.
				 */
				rack->r_might_revert = 1;
			}
			rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
		} else {
			(void)rack_check_reorder_ack(tp, rack, rsm, rsm->r_end, cts, 1, __LINE__);
			rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
		}
		if ((rsm->r_flags & RACK_TO_REXT) &&
		    (tp->t_flags & TF_RCVD_TSTMP) &&
		    (to->to_flags & TOF_TS) &&
		    (to->to_tsecr != 0) &&
		    (tp->t_flags & TF_PREVVALID)) {
			/*
			 * We can use the timestamp to see
			 * if this retransmission was from the
			 * first transmit. If so we made a mistake.
			 */
			tp->t_flags &= ~TF_PREVVALID;
			if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
				/* The first transmit is what this ack is for */
				rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__);
			}
		}
		left = th_ack - rsm->r_end;
		if (rack->app_limited_needs_set && newly_acked)
			rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
		/* Free back to zone */
		rack_free(rack, rsm);
		if (left) {
			/* th_ack extends past this entry; consume the next one. */
			goto more;
		}
		/* Check for reneging */
		rsm = tqhash_min(rack->r_ctl.tqh);
		if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
			/*
			 * The peer has moved snd_una up to
			 * the edge of this send, i.e. one
			 * that it had previously acked. The only
			 * way that can be true if the peer threw
			 * away data (space issues) that it had
			 * previously sacked (else it would have
			 * given us snd_una up to (rsm->r_end).
			 * We need to undo the acked markings here.
			 *
			 * Note we have to look to make sure th_ack is
			 * our rsm->r_start in case we get an old ack
			 * where th_ack is behind snd_una.
			 */
			rack_peer_reneges(rack, rsm, th_ack);
		}
		return;
	}
	/*
	 * From here down th_ack only partially covers the head entry:
	 * trim it in place rather than freeing it.
	 */
	if (rsm->r_flags & RACK_ACKED) {
		/*
		 * It was acked on the scoreboard -- remove it from
		 * total for the part being cum-acked.
		 */
		rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
	} else {
		rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack);
	}
	/* And what about the lost flag? */
	if (rsm->r_flags & RACK_WAS_LOST) {
		/*
		 * This can happen when we marked it as lost
		 * and yet before retransmitting we get an ack
		 * which can happen due to reordering. In this
		 * case its only a partial ack of the send.
		 */
		rack_mark_nolonger_lost(rack, rsm);
	}
	/*
	 * Clear the dup ack count for
	 * the piece that remains.
	 */
	rsm->r_dupack = 0;
	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
	if (rsm->r_rtr_bytes) {
		/*
		 * It was retransmitted adjust the
		 * sack holes for what was acked.
		 */
		int ack_am;

		ack_am = (th_ack - rsm->r_start);
		/*
		 * NOTE(review): the subtraction below runs only when
		 * ack_am >= r_rtr_bytes; unless they are equal,
		 * rsm->r_rtr_bytes -= ack_am would go below zero —
		 * confirm the intended condition against upstream.
		 */
		if (ack_am >= rsm->r_rtr_bytes) {
			rack->r_ctl.rc_holes_rxt -= ack_am;
			rsm->r_rtr_bytes -= ack_am;
		}
	}
	/*
	 * Update where the piece starts and record
	 * the time of send of highest cumack sent if
	 * its in our GP range.
	 */
	rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
	/* Now we need to move our offset forward too */
	if (rsm->m &&
	    ((rsm->orig_m_len != rsm->m->m_len) ||
	     (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
		/* Fix up the orig_m_len and possibly the mbuf offset */
		rack_adjust_orig_mlen(rsm);
	}
	rsm->soff += (th_ack - rsm->r_start);
	rack_rsm_sender_update(rack, tp, rsm, 5);

	/*
	 * Handle the special case where we retransmitted part of a
	 * segment; in this case pass in th_ack which is shorter than
	 * r_end.
	 */
	if (rsm->r_flags & RACK_WAS_SACKPASS) {
		rack_check_reorder_ack(tp, rack, rsm, th_ack, cts, 1, __LINE__);
	}
	/* The trim will move th_ack into r_start for us */
	tqhash_trim(rack->r_ctl.tqh, th_ack);
	/* Now do we need to move the mbuf fwd too? */
	{
		struct mbuf *m;
		uint32_t soff;

		m = rsm->m;
		soff = rsm->soff;
		if (m) {
			/* Walk the chain until soff falls within one mbuf. */
			while (soff >= m->m_len) {
				soff -= m->m_len;
				KASSERT((m->m_next != NULL),
					(" rsm:%p  off:%u soff:%u m:%p",
					 rsm, rsm->soff, soff, m));
				m = m->m_next;
				if (m == NULL) {
					/*
					 * This is a fall-back that prevents a panic. In reality
					 * we should be able to walk the mbuf's and find our place.
					 * At this point snd_una has not been updated with the sbcut() yet
					 * but tqhash_trim did update rsm->r_start so the offset calculation
					 * should work fine. This is undesirable since we will take cache
					 * hits to access the socket buffer. And even more puzzling is that
					 * it happens occasionally. It should not :(
					 */
					m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
						      (rsm->r_start - tp->snd_una),
						      &soff);
					break;
				}
			}
			/*
			 * Now save in our updated values.
			 */
			rsm->m = m;
			rsm->soff = soff;
			rsm->orig_m_len = rsm->m->m_len;
			rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
		}
	}
	if (rack->app_limited_needs_set &&
	    SEQ_GEQ(th_ack, tp->gput_seq))
		rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
}
10480
10481 static void
rack_handle_might_revert(struct tcpcb * tp,struct tcp_rack * rack)10482 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
10483 {
10484 struct rack_sendmap *rsm;
10485 int sack_pass_fnd = 0;
10486
10487 if (rack->r_might_revert) {
10488 /*
10489 * Ok we have reordering, have not sent anything, we
10490 * might want to revert the congestion state if nothing
10491 * further has SACK_PASSED on it. Lets check.
10492 *
10493 * We also get here when we have DSACKs come in for
10494 * all the data that we FR'd. Note that a rxt or tlp
10495 * timer clears this from happening.
10496 */
10497
10498 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
10499 if (rsm->r_flags & RACK_SACK_PASSED) {
10500 sack_pass_fnd = 1;
10501 break;
10502 }
10503 }
10504 if (sack_pass_fnd == 0) {
10505 /*
10506 * We went into recovery
10507 * incorrectly due to reordering!
10508 */
10509 int orig_cwnd;
10510
10511 rack->r_ent_rec_ns = 0;
10512 orig_cwnd = tp->snd_cwnd;
10513 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
10514 tp->snd_recover = tp->snd_una;
10515 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
10516 if (IN_RECOVERY(tp->t_flags)) {
10517 rack_exit_recovery(tp, rack, 3);
10518 if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){
10519 /*
10520 * We were in recovery, had an RTO
10521 * and then re-entered recovery (more sack's arrived)
10522 * and we have properly recorded the old ssthresh from
10523 * the first recovery. We want to be able to slow-start
10524 * back to this level. The ssthresh from the timeout
10525 * and then back into recovery will end up most likely
10526 * to be min(cwnd=1mss, 2mss). Which makes it basically
10527 * so we get no slow-start after our RTO.
10528 */
10529 rack->rto_from_rec = 0;
10530 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
10531 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
10532 }
10533 }
10534 }
10535 rack->r_might_revert = 0;
10536 }
10537 }
10538
10539
10540 static int
rack_note_dsack(struct tcp_rack * rack,tcp_seq start,tcp_seq end)10541 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
10542 {
10543
10544 uint32_t am, l_end;
10545 int was_tlp = 0;
10546
10547 if (SEQ_GT(end, start))
10548 am = end - start;
10549 else
10550 am = 0;
10551 if ((rack->rc_last_tlp_acked_set ) &&
10552 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
10553 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
10554 /*
10555 * The DSACK is because of a TLP which we don't
10556 * do anything with the reordering window over since
10557 * it was not reordering that caused the DSACK but
10558 * our previous retransmit TLP.
10559 */
10560 rack_log_dsack_event(rack, 7, __LINE__, start, end);
10561 was_tlp = 1;
10562 goto skip_dsack_round;
10563 }
10564 if (rack->rc_last_sent_tlp_seq_valid) {
10565 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
10566 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
10567 (SEQ_LEQ(end, l_end))) {
10568 /*
10569 * This dsack is from the last sent TLP, ignore it
10570 * for reordering purposes.
10571 */
10572 rack_log_dsack_event(rack, 7, __LINE__, start, end);
10573 was_tlp = 1;
10574 goto skip_dsack_round;
10575 }
10576 }
10577 if (rack->rc_dsack_round_seen == 0) {
10578 rack->rc_dsack_round_seen = 1;
10579 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
10580 rack->r_ctl.num_dsack++;
10581 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
10582 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
10583 }
10584 skip_dsack_round:
10585 /*
10586 * We keep track of how many DSACK blocks we get
10587 * after a recovery incident.
10588 */
10589 rack->r_ctl.dsack_byte_cnt += am;
10590 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
10591 rack->r_ctl.retran_during_recovery &&
10592 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
10593 /*
10594 * False recovery most likely culprit is reordering. If
10595 * nothing else is missing we need to revert.
10596 */
10597 rack->r_might_revert = 1;
10598 rack_handle_might_revert(rack->rc_tp, rack);
10599 rack->r_might_revert = 0;
10600 rack->r_ctl.retran_during_recovery = 0;
10601 rack->r_ctl.dsack_byte_cnt = 0;
10602 }
10603 return (was_tlp);
10604 }
10605
10606 static uint32_t
do_rack_compute_pipe(struct tcpcb * tp,struct tcp_rack * rack,uint32_t snd_una)10607 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
10608 {
10609 return (((tp->snd_max - snd_una) -
10610 (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt);
10611 }
10612
10613 static int32_t
rack_compute_pipe(struct tcpcb * tp)10614 rack_compute_pipe(struct tcpcb *tp)
10615 {
10616 return ((int32_t)do_rack_compute_pipe(tp,
10617 (struct tcp_rack *)tp->t_fb_ptr,
10618 tp->snd_una));
10619 }
10620
/*
 * Proportional Rate Reduction style accounting (in the spirit of
 * RFC 6937): given 'changed' newly delivered bytes, recompute how many
 * bytes we may (re)transmit while in recovery (rc_prr_sndcnt).
 * Called only in recovery with PRR enabled.
 */
static void
rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
{
	/* Deal with changed and PRR here (in recovery only) */
	uint32_t pipe, snd_una;

	rack->r_ctl.rc_prr_delivered += changed;

	if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
		/*
		 * It is all outstanding, we are application limited
		 * and thus we don't need more room to send anything.
		 * Note we use tp->snd_una here and not th_ack because
		 * the data as yet not been cut from the sb.
		 */
		rack->r_ctl.rc_prr_sndcnt = 0;
		return;
	}
	/* Compute prr_sndcnt */
	/* Use the larger of snd_una and th_ack as the una point for pipe. */
	if (SEQ_GT(tp->snd_una, th_ack)) {
		snd_una = tp->snd_una;
	} else {
		snd_una = th_ack;
	}
	pipe = do_rack_compute_pipe(tp, rack, snd_una);
	if (pipe > tp->snd_ssthresh) {
		long sndcnt;

		/* Proportional part: delivered * ssthresh / RecoverFS. */
		sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
		if (rack->r_ctl.rc_prr_recovery_fs > 0)
			sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
		else {
			/* Defensive: a zero RecoverFS should not occur; send nothing. */
			rack->r_ctl.rc_prr_sndcnt = 0;
			rack_log_to_prr(rack, 9, 0, __LINE__);
			sndcnt = 0;
		}
		sndcnt++;
		/* Credit against what was already sent during recovery. */
		if (sndcnt > (long)rack->r_ctl.rc_prr_out)
			sndcnt -= rack->r_ctl.rc_prr_out;
		else
			sndcnt = 0;
		rack->r_ctl.rc_prr_sndcnt = sndcnt;
		rack_log_to_prr(rack, 10, 0, __LINE__);
	} else {
		uint32_t limit;

		/* Slow-start-like part when pipe has drained below ssthresh. */
		if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
			limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
		else
			limit = 0;
		if (changed > limit)
			limit = changed;
		limit += ctf_fixed_maxseg(tp);
		if (tp->snd_ssthresh > pipe) {
			rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
			rack_log_to_prr(rack, 11, 0, __LINE__);
		} else {
			/*
			 * Here pipe == ssthresh, so min(0, limit) is always 0
			 * (limit is unsigned): nothing extra may be sent.
			 */
			rack->r_ctl.rc_prr_sndcnt = min(0, limit);
			rack_log_to_prr(rack, 12, 0, __LINE__);
		}
	}
}
10683
/*
 * Process the ACK/SACK information of an inbound ack against the RACK
 * scoreboard: advance the cumulative ack point, validate and record
 * SACK/D-SACK blocks, sort and de-duplicate them, feed each block to
 * rack_proc_sack_blk(), and finally decide whether to enter recovery
 * or update PRR.  *dsack_seen / *sacks_seen (may be NULL) report back
 * whether a D-SACK was present and how many SACK blocks survived
 * filtering.
 */
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck,
    int *dsack_seen, int *sacks_seen)
{
	uint32_t changed;
	struct tcp_rack *rack;
	struct rack_sendmap *rsm;
	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
	register uint32_t th_ack;
	int32_t i, j, k, num_sack_blks = 0;
	uint32_t cts, acked, ack_point;
	int loop_start = 0;
	uint32_t tsused;
	uint32_t segsiz;


	INP_WLOCK_ASSERT(tptoinpcb(tp));
	if (tcp_get_flags(th) & TH_RST) {
		/* We don't log resets */
		return;
	}
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	cts = tcp_get_usecs(NULL);
	rsm = tqhash_min(rack->r_ctl.tqh);
	changed = 0;
	th_ack = th->th_ack;
	segsiz = ctf_fixed_maxseg(rack->rc_tp);
	if (SEQ_GT(th_ack, tp->snd_una)) {
		/* Forward progress: refresh the progress timer base. */
		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
		tp->t_acktime = ticks;
	}
	if (rsm && SEQ_GT(th_ack, rsm->r_start))
		changed = th_ack - rsm->r_start;
	if (changed) {
		/* The cum-ack moved: retire fully/partially acked map entries. */
		rack_process_to_cumack(tp, rack, th_ack, cts, to,
		    tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
	}
	if ((to->to_flags & TOF_SACK) == 0) {
		/* We are done nothing left and no sack. */
		rack_handle_might_revert(tp, rack);
		/*
		 * For cases where we struck a dup-ack
		 * with no SACK, add to the changes so
		 * PRR will work right.
		 */
		if (dup_ack_struck && (changed == 0)) {
			changed += ctf_fixed_maxseg(rack->rc_tp);
		}
		goto out;
	}
	/* Sack block processing */
	if (SEQ_GT(th_ack, tp->snd_una))
		ack_point = th_ack;
	else
		ack_point = tp->snd_una;
	for (i = 0; i < to->to_nsacks; i++) {
		bcopy((to->to_sacks + i * TCPOLEN_SACK),
		    &sack, sizeof(sack));
		sack.start = ntohl(sack.start);
		sack.end = ntohl(sack.end);
		/* Keep only blocks entirely above the ack point and within snd_max. */
		if (SEQ_GT(sack.end, sack.start) &&
		    SEQ_GT(sack.start, ack_point) &&
		    SEQ_LT(sack.start, tp->snd_max) &&
		    SEQ_GT(sack.end, ack_point) &&
		    SEQ_LEQ(sack.end, tp->snd_max)) {
			sack_blocks[num_sack_blks] = sack;
			num_sack_blks++;
		} else if (SEQ_LEQ(sack.start, th_ack) &&
		    SEQ_LEQ(sack.end, th_ack)) {
			int was_tlp;

			if (dsack_seen != NULL)
				*dsack_seen = 1;
			was_tlp = rack_note_dsack(rack, sack.start, sack.end);
			/*
			 * Its a D-SACK block.
			 */
			tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
		}
	}
	if (rack->rc_dsack_round_seen) {
		/* Is the dsack roound over? */
		if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
			/* Yes it is */
			rack->rc_dsack_round_seen = 0;
			rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
		}
	}
	/*
	 * Sort the SACK blocks so we can update the rack scoreboard with
	 * just one pass.
	 */
	num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks,
	    num_sack_blks, th->th_ack);
	ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
	if (sacks_seen != NULL)
		*sacks_seen = num_sack_blks;
	if (num_sack_blks == 0) {
		/* Nothing to sack */
		goto out;
	}
	/* Its a sack of some sort */
	if (num_sack_blks < 2) {
		/* Only one, we don't need to sort */
		goto do_sack_work;
	}
	/* Sort the sacks: simple insertion-style sort by ascending end seq. */
	for (i = 0; i < num_sack_blks; i++) {
		for (j = i + 1; j < num_sack_blks; j++) {
			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
				sack = sack_blocks[i];
				sack_blocks[i] = sack_blocks[j];
				sack_blocks[j] = sack;
			}
		}
	}
	/*
	 * Now are any of the sack block ends the same (yes some
	 * implementations send these)?
	 */
again:
	if (num_sack_blks == 0)
		goto out;
	if (num_sack_blks > 1) {
		for (i = 0; i < num_sack_blks; i++) {
			for (j = i + 1; j < num_sack_blks; j++) {
				if (sack_blocks[i].end == sack_blocks[j].end) {
					/*
					 * Ok these two have the same end we
					 * want the smallest end and then
					 * throw away the larger and start
					 * again.
					 */
					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
						/*
						 * The second block covers
						 * more area use that
						 */
						sack_blocks[i].start = sack_blocks[j].start;
					}
					/*
					 * Now collapse out the dup-sack and
					 * lower the count
					 */
					for (k = (j + 1); k < num_sack_blks; k++) {
						sack_blocks[j].start = sack_blocks[k].start;
						sack_blocks[j].end = sack_blocks[k].end;
						j++;
					}
					num_sack_blks--;
					/* Restart the scan after the collapse (small n, cheap). */
					goto again;
				}
			}
		}
	}
do_sack_work:
	/*
	 * First lets look to see if
	 * we have retransmitted and
	 * can use the transmit next?
	 */
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
	if (rsm &&
	    SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
	    SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
		/*
		 * We probably did the FR and the next
		 * SACK in continues as we would expect.
		 */
		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz);
		if (acked) {
			rack->r_wanted_output = 1;
			changed += acked;
		}
		if (num_sack_blks == 1) {
			goto out;
		} else {
			/*
			 * Start the loop through the
			 * rest of blocks, past the first block.
			 */
			loop_start = 1;
		}
	}
	rsm = rack->r_ctl.rc_sacklast;
	for (i = loop_start; i < num_sack_blks; i++) {
		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz);
		if (acked) {
			rack->r_wanted_output = 1;
			changed += acked;
		}
	}
out:
	if (changed) {
		/* Something changed cancel the rack timer */
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
	}
	tsused = tcp_get_usecs(NULL);
	rsm = tcp_rack_output(tp, rack, tsused);
	if ((!IN_FASTRECOVERY(tp->t_flags)) &&
	    rsm &&
	    ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
		/* Enter recovery */
		/*
		 * entered_recovery is a by-value parameter; setting it here
		 * only gates the PRR update below, not the caller's copy.
		 */
		entered_recovery = 1;
		rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
		/*
		 * When we enter recovery we need to assure we send
		 * one packet.
		 */
		if (rack->rack_no_prr == 0) {
			rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
			rack_log_to_prr(rack, 8, 0, __LINE__);
		}
		rack->r_timer_override = 1;
		rack->r_early = 0;
		rack->r_ctl.rc_agg_early = 0;
	} else if (IN_FASTRECOVERY(tp->t_flags) &&
	    rsm &&
	    (rack->r_rr_config == 3)) {
		/*
		 * Assure we can output and we get no
		 * remembered pace time except the retransmit.
		 */
		rack->r_timer_override = 1;
		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
		rack->r_ctl.rc_resend = rsm;
	}
	if (IN_FASTRECOVERY(tp->t_flags) &&
	    (rack->rack_no_prr == 0) &&
	    (entered_recovery == 0)) {
		rack_update_prr(tp, rack, changed, th_ack);
		if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
		    ((tcp_in_hpts(rack->rc_tp) == 0) &&
		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
			/*
			 * If you are pacing output you don't want
			 * to override.
			 */
			rack->r_early = 0;
			rack->r_ctl.rc_agg_early = 0;
			rack->r_timer_override = 1;
		}
	}
}
10928
10929 static void
rack_strike_dupack(struct tcp_rack * rack,tcp_seq th_ack)10930 rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
10931 {
10932 struct rack_sendmap *rsm;
10933
10934 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
10935 while (rsm) {
10936 /*
10937 * We need to skip anything already set
10938 * to be retransmitted.
10939 */
10940 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
10941 (rsm->r_flags & RACK_MUST_RXT)) {
10942 rsm = TAILQ_NEXT(rsm, r_tnext);
10943 continue;
10944 }
10945 break;
10946 }
10947 if (rsm && (rsm->r_dupack < 0xff)) {
10948 rsm->r_dupack++;
10949 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
10950 struct timeval tv;
10951 uint32_t cts;
10952 /*
10953 * Here we see if we need to retransmit. For
10954 * a SACK type connection if enough time has passed
10955 * we will get a return of the rsm. For a non-sack
10956 * connection we will get the rsm returned if the
10957 * dupack value is 3 or more.
10958 */
10959 cts = tcp_get_usecs(&tv);
10960 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
10961 if (rack->r_ctl.rc_resend != NULL) {
10962 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
10963 rack_cong_signal(rack->rc_tp, CC_NDUPACK,
10964 th_ack, __LINE__);
10965 }
10966 rack->r_wanted_output = 1;
10967 rack->r_timer_override = 1;
10968 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
10969 }
10970 } else {
10971 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
10972 }
10973 }
10974 }
10975
static void
rack_check_bottom_drag(struct tcpcb *tp,
    struct tcp_rack *rack,
    struct socket *so)
{
	/*
	 * So what is dragging bottom?
	 *
	 * Dragging bottom means you were under pacing and had a
	 * delay in processing inbound acks waiting on our pacing
	 * timer to expire. While you were waiting all of the acknowledgments
	 * for the packets you sent have arrived. This means we are pacing
	 * way underneath the bottleneck to the point where our Goodput
	 * measurements stop working, since they require more than one
	 * ack (usually at least 8 packets worth with multiple acks so we can
	 * gauge the inter-ack times). If that occurs we have a real problem
	 * since we are stuck in a hole that we can't get out of without
	 * something speeding us up.
	 *
	 * We also check to see if we are widdling down to just one segment
	 * outstanding. If this occurs and we have room to send in our cwnd/rwnd
	 * then we are adding the delayed ack interval into our measurments and
	 * we need to speed up slightly.
	 */
	uint32_t segsiz, minseg;

	segsiz = ctf_fixed_maxseg(tp);
	minseg = segsiz;
	if (tp->snd_max == tp->snd_una) {
		/*
		 * We are doing dynamic pacing and we are way
		 * under. Basically everything got acked while
		 * we were still waiting on the pacer to expire.
		 *
		 * This means we need to boost the b/w in
		 * addition to any earlier boosting of
		 * the multiplier.
		 */
		uint64_t lt_bw;

		tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM);
		lt_bw = rack_get_lt_bw(rack);
		rack->rc_dragged_bottom = 1;
		rack_validate_multipliers_at_or_above100(rack);
		if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
		    (rack->dis_lt_bw == 0) &&
		    (rack->use_lesser_lt_bw == 0) &&
		    (lt_bw > 0)) {
			/*
			 * Lets use the long-term b/w we have
			 * been getting as a base.
			 */
			if (rack->rc_gp_filled == 0) {
				/* No goodput estimate yet: seed one from lt_bw. */
				if (lt_bw > ONE_POINT_TWO_MEG) {
					/*
					 * If we have no measurement
					 * don't let us set in more than
					 * 1.2Mbps. If we are still too
					 * low after pacing with this we
					 * will hopefully have a max b/w
					 * available to sanity check things.
					 */
					lt_bw = ONE_POINT_TWO_MEG;
				}
				rack->r_ctl.rc_rtt_diff = 0;
				rack->r_ctl.gp_bw = lt_bw;
				rack->rc_gp_filled = 1;
				if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
					rack->r_ctl.num_measurements = RACK_REQ_AVG;
				rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
			} else if (lt_bw > rack->r_ctl.gp_bw) {
				/* Long-term b/w exceeds the estimate: adopt it. */
				rack->r_ctl.rc_rtt_diff = 0;
				if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
					rack->r_ctl.num_measurements = RACK_REQ_AVG;
				rack->r_ctl.gp_bw = lt_bw;
				rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
			} else
				rack_increase_bw_mul(rack, -1, 0, 0, 1);
			if ((rack->gp_ready == 0) &&
			    (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
				/* We have enough measurements now */
				rack->gp_ready = 1;
				if (rack->dgp_on ||
				    rack->rack_hibeta)
					rack_set_cc_pacing(rack);
				if (rack->defer_options)
					rack_apply_deferred_options(rack);
			}
		} else {
			/*
			 * zero rtt possibly?, settle for just an old increase.
			 */
			rack_increase_bw_mul(rack, -1, 0, 0, 1);
		}
	} else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
	    (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
	    minseg)) &&
	    (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
	    (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
	    (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
	    (segsiz * rack_req_segs))) {
		/*
		 * We are doing dynamic GP pacing and
		 * we have everything except 1MSS or less
		 * bytes left out. We are still pacing away.
		 * And there is data that could be sent, This
		 * means we are inserting delayed ack time in
		 * our measurements because we are pacing too slow.
		 */
		rack_validate_multipliers_at_or_above100(rack);
		rack->rc_dragged_bottom = 1;
		rack_increase_bw_mul(rack, -1, 0, 0, 1);
	}
}
11090
11091 #ifdef TCP_REQUEST_TRK
/*
 * BB-log a hybrid pacing event.  Logs when full BB logging is enabled,
 * or — for the RULES_APP/RULES_SET/REQ_COMP mod codes only — when the
 * TCP_BBPOINT_REQ_LEVEL_LOGGING point is on.  'cur' (may be NULL) is
 * the sendfile-track entry whose fields are packed into the log record;
 * when NULL, 'err' is logged in flex2 instead.
 */
static void
rack_log_hybrid(struct tcp_rack *rack, uint32_t seq,
    struct tcp_sendfile_track *cur, uint8_t mod, int line, int err)
{
	int do_log;

	do_log = tcp_bblogging_on(rack->rc_tp);
	if (do_log == 0) {
		if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0)
			return;
		/* We only allow the three below with point logging on */
		if ((mod != HYBRID_LOG_RULES_APP) &&
		    (mod != HYBRID_LOG_RULES_SET) &&
		    (mod != HYBRID_LOG_REQ_COMP))
			return;

	}
	if (do_log) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		/* Convert our ms to a microsecond */
		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = seq;
		log.u_bbr.cwnd_gain = line;
		if (cur != NULL) {
			uint64_t off;

			log.u_bbr.flex2 = cur->start_seq;
			log.u_bbr.flex3 = cur->end_seq;
			/* 64-bit localtime split across flex4 (hi) / flex5 (lo). */
			log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
			log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff);
			log.u_bbr.flex6 = cur->flags;
			log.u_bbr.pkts_out = cur->hybrid_flags;
			log.u_bbr.rttProp = cur->timestamp;
			log.u_bbr.cur_del_rate = cur->cspr;
			log.u_bbr.bw_inuse = cur->start;
			/* 64-bit end split across applimited (lo) / delivered (hi). */
			log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff);
			log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ;
			/* 64-bit deadline split across epoch (lo) / lt_epoch (hi). */
			log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
			log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ;
			log.u_bbr.inhpts = 1;
#ifdef TCP_REQUEST_TRK
			/* use_lt_bw carries the slot index of 'cur' in t_tcpreq_info[]. */
			off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
			log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
#endif
		} else {
			log.u_bbr.flex2 = err;
		}
		/*
		 * Fill in flex7 to be CHD (catchup|hybrid|DGP)
		 */
		log.u_bbr.flex7 = rack->rc_catch_up;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_hybrid_mode;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->dgp_on;
		/*
		 * Compose bbr_state to be a bit wise 0000ADHF
		 * where A is the always_pace flag
		 * where D is the dgp_on flag
		 * where H is the hybrid_mode on flag
		 * where F is the use_fixed_rate flag.
		 */
		log.u_bbr.bbr_state = rack->rc_always_pace;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->dgp_on;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->use_fixed_rate;
		log.u_bbr.flex8 = mod;
		log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
		log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start;
		log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error;
		log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop;
		tcp_log_event(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_HYBRID_PACING_LOG, 0,
		    0, &log, false, NULL, __func__, __LINE__, &tv);
	}
}
11178 #endif
11179
11180 #ifdef TCP_REQUEST_TRK
/*
 * Locate the tcp_sendfile_track entry covering 'seq' (or, failing that,
 * the end of the range seq+len-1) and install its hybrid pacing
 * parameters — rate cap or fill-cw cap, client-suggested maxseg, and
 * catch-up deadline — as the currently active set (rc_last_sft).
 * When no entry (or no hybrid parameters) exists we fall back to plain
 * DGP pacing.  'cts' is the current time in usec.
 */
static void
rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
{
	struct tcp_sendfile_track *rc_cur, *orig_ent;
	struct tcpcb *tp;
	int err = 0;

	orig_ent = rack->r_ctl.rc_last_sft;
	rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq);
	if (rc_cur == NULL) {
		/* If not in the beginning what about the end piece */
		if (rack->rc_hybrid_mode)
			rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
		rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1));
	} else {
		/* Sentinel value only used in the hybrid log's err field. */
		err = 12345;
	}
	/* If we find no parameters we are in straight DGP mode */
	if(rc_cur == NULL) {
		/* None found for this seq, just DGP for now */
		if (rack->rc_hybrid_mode) {
			rack->r_ctl.client_suggested_maxseg = 0;
			rack->rc_catch_up = 0;
			if (rack->cspr_is_fcc == 0)
				rack->r_ctl.bw_rate_cap = 0;
			else
				rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
		}
		/*
		 * NOTE(review): this rc_hybrid_mode check duplicates the one
		 * just above; the two bodies could be merged.
		 */
		if (rack->rc_hybrid_mode) {
			rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
		}
		if (rack->r_ctl.rc_last_sft) {
			rack->r_ctl.rc_last_sft = NULL;
		}
		return;
	}
	if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) {
		/* This entry was never setup for hybrid pacing on/off etc */
		if (rack->rc_hybrid_mode) {
			rack->r_ctl.client_suggested_maxseg = 0;
			rack->rc_catch_up = 0;
			rack->r_ctl.bw_rate_cap = 0;
		}
		if (rack->r_ctl.rc_last_sft) {
			rack->r_ctl.rc_last_sft = NULL;
		}
		/* Record the first-send time/counters once per entry. */
		if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
			rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND;
			rc_cur->first_send = cts;
			rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes;
			rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
		}
		return;
	}
	/*
	 * Ok if we have a new entry *or* have never
	 * set up an entry we need to proceed. If
	 * we have already set it up this entry we
	 * just continue along with what we already
	 * setup.
	 */
	tp = rack->rc_tp;
	if ((rack->r_ctl.rc_last_sft != NULL) &&
	    (rack->r_ctl.rc_last_sft == rc_cur)) {
		/* Its already in place */
		if (rack->rc_hybrid_mode)
			rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0);
		return;
	}
	if (rack->rc_hybrid_mode == 0) {
		/* Not in hybrid mode: just track the entry and note last-send. */
		rack->r_ctl.rc_last_sft = rc_cur;
		if (orig_ent) {
			orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
			orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
			orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
		}
		rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
		return;
	}
	if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){
		/* Compensate for all the header overhead's */
		if (rack->cspr_is_fcc == 0)
			rack->r_ctl.bw_rate_cap	= rack_compensate_for_linerate(rack, rc_cur->cspr);
		else
			rack->r_ctl.fillcw_cap =  rack_compensate_for_linerate(rack, rc_cur->cspr);
	} else {
		if (rack->rc_hybrid_mode) {
			if (rack->cspr_is_fcc == 0)
				rack->r_ctl.bw_rate_cap = 0;
			else
				rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
		}
	}
	if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
		rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
	else
		rack->r_ctl.client_suggested_maxseg = 0;
	if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) {
		/*
		 * It is the same timestamp as the previous one
		 * add the hybrid flag that will indicate we use
		 * sendtime not arrival time for catch-up mode.
		 */
		rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME;
	}
	if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
	    (rc_cur->cspr > 0)) {
		uint64_t len;

		rack->rc_catch_up = 1;
		/*
		 * Calculate the deadline time, first set the
		 * time to when the request arrived.
		 */
		if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) {
			/*
			 * For cases where its a duplicate tm (we received more
			 * than one request for a tm) we want to use now, the point
			 * where we are just sending the first bit of the request.
			 */
			rc_cur->deadline = cts;
		} else {
			/*
			 * Here we have a different tm from the last request
			 * so we want to use arrival time as our base.
			 */
			rc_cur->deadline = rc_cur->localtime;
		}
		/*
		 * Next calculate the length and compensate for
		 * TLS if need be.
		 */
		len = rc_cur->end - rc_cur->start;
		if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) {
			/*
			 * This session is doing TLS. Take a swag guess
			 * at the overhead.
			 */
			len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len);
		}
		/*
		 * Now considering the size, and the cspr, what is the time that
		 * would be required at the cspr rate. Here we use the raw
		 * cspr value since the client only looks at the raw data. We
		 * do use len which includes TLS overhead, but not the TCP/IP etc.
		 * That will get made up for in the CU pacing rate set.
		 */
		len *= HPTS_USEC_IN_SEC;
		len /= rc_cur->cspr;
		rc_cur->deadline += len;
	} else {
		rack->rc_catch_up = 0;
		rc_cur->deadline = 0;
	}
	if (rack->r_ctl.client_suggested_maxseg != 0) {
		/*
		 * We need to reset the max pace segs if we have a
		 * client_suggested_maxseg.
		 */
		rack_set_pace_segments(tp, rack, __LINE__, NULL);
	}
	if (orig_ent) {
		orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
		orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
		orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
	}
	rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
	/* Remember it for next time and for CU mode */
	rack->r_ctl.rc_last_sft = rc_cur;
	rack->r_ctl.last_tm_mark = rc_cur->timestamp;
}
11352 #endif
11353
/*
 * Called on output of [seq, seq+len): make sure the current
 * sendfile-track entry covers the send, refreshing the hybrid pacing
 * state when we have crossed past the entry's end, stretching the
 * entry's end_seq when our guess was short (e.g. TLS overhead), and
 * stamping the entry's first-send time/counters.  No-op without
 * TCP_REQUEST_TRK.
 */
static void
rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
{
#ifdef TCP_REQUEST_TRK
	struct tcp_sendfile_track *ent;

	ent = rack->r_ctl.rc_last_sft;
	/*
	 * NOTE(review): the flags test is equality against
	 * TCP_TRK_TRACK_FLG_EMPTY, not a bit-test — presumably EMPTY means
	 * "no flags set"; confirm against the flag definitions.
	 */
	if ((ent == NULL) ||
	    (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) ||
	    (SEQ_GEQ(seq, ent->end_seq))) {
		/* Time to update the track. */
		rack_set_dgp_hybrid_mode(rack, seq, len, cts);
		ent = rack->r_ctl.rc_last_sft;
	}
	/* Out of all */
	if (ent == NULL) {
		return;
	}
	if (SEQ_LT(ent->end_seq, (seq + len))) {
		/*
		 * This is the case where our end_seq guess
		 * was wrong. This is usually due to TLS having
		 * more bytes then our guess. It could also be the
		 * case that the client sent in two requests closely
		 * and the SB is full of both so we are sending part
		 * of each (end|beg). In such a case lets move this
		 * guys end to match the end of this send. That
		 * way it will complete when all of it is acked.
		 */
		ent->end_seq = (seq + len);
		if (rack->rc_hybrid_mode)
			rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__);
	}
	/* Now validate we have set the send time of this one */
	if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
		ent->flags |= TCP_TRK_TRACK_FLG_FSND;
		ent->first_send = cts;
		ent->sent_at_fs = rack->rc_tp->t_sndbytes;
		ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
	}
#endif
}
11396
11397 static void
rack_gain_for_fastoutput(struct tcp_rack * rack,struct tcpcb * tp,struct socket * so,uint32_t acked_amount)11398 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
11399 {
11400 /*
11401 * The fast output path is enabled and we
11402 * have moved the cumack forward. Lets see if
11403 * we can expand forward the fast path length by
11404 * that amount. What we would ideally like to
11405 * do is increase the number of bytes in the
11406 * fast path block (left_to_send) by the
11407 * acked amount. However we have to gate that
11408 * by two factors:
11409 * 1) The amount outstanding and the rwnd of the peer
11410 * (i.e. we don't want to exceed the rwnd of the peer).
11411 * <and>
11412 * 2) The amount of data left in the socket buffer (i.e.
11413 * we can't send beyond what is in the buffer).
11414 *
11415 * Note that this does not take into account any increase
11416 * in the cwnd. We will only extend the fast path by
11417 * what was acked.
11418 */
11419 uint32_t new_total, gating_val;
11420
11421 new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
11422 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
11423 (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
11424 if (new_total <= gating_val) {
11425 /* We can increase left_to_send by the acked amount */
11426 counter_u64_add(rack_extended_rfo, 1);
11427 rack->r_ctl.fsb.left_to_send = new_total;
11428 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
11429 ("rack:%p left_to_send:%u sbavail:%u out:%u",
11430 rack, rack->r_ctl.fsb.left_to_send,
11431 sbavail(&rack->rc_inp->inp_socket->so_snd),
11432 (tp->snd_max - tp->snd_una)));
11433
11434 }
11435 }
11436
static void
rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb)
{
	/*
	 * Here any sendmap entry that points to the
	 * beginning mbuf must be adjusted to the correct
	 * offset. This must be called with:
	 * 1) The socket buffer locked
	 * 2) snd_una adjusted to its new position.
	 *
	 * Note that (2) implies rack_ack_received has also
	 * been called and all the sbcut's have been done.
	 *
	 * We grab the first mbuf in the socket buffer and
	 * then go through the front of the sendmap, recalculating
	 * the stored offset for any sendmap entry that has
	 * that mbuf. We must use the sb functions to do this
	 * since its possible an add was done has well as
	 * the subtraction we may have just completed. This should
	 * not be a penalty though, since we just referenced the sb
	 * to go in and trim off the mbufs that we freed (of course
	 * there will be a penalty for the sendmap references though).
	 *
	 * Note also with INVARIANT on, we validate with a KASSERT
	 * that the first sendmap entry has a soff of 0.
	 *
	 */
	struct mbuf *m;
	struct rack_sendmap *rsm;
	tcp_seq snd_una;
#ifdef INVARIANTS
	/* Set once the head RSM's soff==0 invariant has been checked. */
	int first_processed = 0;
#endif

	snd_una = rack->rc_tp->snd_una;
	SOCKBUF_LOCK_ASSERT(sb);
	m = sb->sb_mb;
	rsm = tqhash_min(rack->r_ctl.tqh);
	if ((rsm == NULL) || (m == NULL)) {
		/* Nothing outstanding */
		return;
	}
	/* The very first RSM's mbuf must point to the head mbuf in the sb */
	KASSERT((rsm->m == m),
		("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb",
		 rack, sb, rsm));
	/* Walk forward over every RSM still anchored on the old head mbuf. */
	while (rsm->m && (rsm->m == m)) {
		/* one to adjust */
#ifdef INVARIANTS
		/*
		 * Under INVARIANTS we recompute (tm, soff) independently via
		 * sbsndmbuf() and count (rack_adjust_map_bw) any mismatch
		 * against the cached values before installing the new ones.
		 */
		struct mbuf *tm;
		uint32_t soff;

		tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
		if ((rsm->orig_m_len != m->m_len) ||
		    (rsm->orig_t_space != M_TRAILINGROOM(m))){
			rack_adjust_orig_mlen(rsm);
		}
		if (first_processed == 0) {
			KASSERT((rsm->soff == 0),
				("Rack:%p rsm:%p -- rsm at head but soff not zero",
				 rack, rsm));
			first_processed = 1;
		}
		if ((rsm->soff != soff) || (rsm->m != tm)) {
			/*
			 * This is not a fatal error, we anticipate it
			 * might happen (the else code), so we count it here
			 * so that under invariant we can see that it really
			 * does happen.
			 */
			counter_u64_add(rack_adjust_map_bw, 1);
		}
		rsm->m = tm;
		rsm->soff = soff;
		if (tm) {
			rsm->orig_m_len = rsm->m->m_len;
			rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
		} else {
			rsm->orig_m_len = 0;
			rsm->orig_t_space = 0;
		}
#else
		/*
		 * Production path: recompute the (mbuf, offset) anchor for
		 * this RSM directly from the socket buffer and refresh the
		 * cached length/trailing-space snapshots.
		 */
		rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
		if (rsm->m) {
			rsm->orig_m_len = rsm->m->m_len;
			rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
		} else {
			rsm->orig_m_len = 0;
			rsm->orig_t_space = 0;
		}
#endif
		rsm = tqhash_next(rack->r_ctl.tqh, rsm);
		if (rsm == NULL)
			break;
	}
}
11533
#ifdef TCP_REQUEST_TRK
/*
 * Process request-tracking completions for a cumulative ACK (th_ack).
 * When neither hybrid pacing nor request-level BB logging is active this
 * defers to the generic tcp_req_check_for_comp(); otherwise each completed
 * entry is handled manually so we can emit the hybrid logs, compute the
 * achieved client bandwidth, and clear any pacing state tied to the entry.
 */
static inline void
rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack)
{
	struct tcp_sendfile_track *ent;
	int i;

	if ((rack->rc_hybrid_mode == 0) &&
	    (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) {
		/*
		 * Just do normal completions hybrid pacing is not on
		 * and CLDL is off as well.
		 */
		tcp_req_check_for_comp(rack->rc_tp, th_ack);
		return;
	}
	/*
	 * Originally I was just going to find the th_ack associated
	 * with an entry. But then I realized a large stretch ack could
	 * in theory ack two or more requests at once. So instead we
	 * need to find all entries that are completed by th_ack not
	 * just a single entry and do our logging.
	 */
	ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
	while (ent != NULL) {
		/*
		 * We may be doing hybrid pacing or CLDL and need more details possibly
		 * so we do it manually instead of calling
		 * tcp_req_check_for_comp()
		 */
		uint64_t laa, tim, data, cbw, ftim;

		/* Ok this ack frees it */
		rack_log_hybrid(rack, th_ack,
				ent, HYBRID_LOG_REQ_COMP, __LINE__, 0);
		rack_log_hybrid_sends(rack, ent, __LINE__);
		/* calculate the time based on the ack arrival */
		data = ent->end - ent->start;
		laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
		if (ent->flags & TCP_TRK_TRACK_FLG_FSND) {
			/* Start from the later of request arrival and first send. */
			if (ent->first_send > ent->localtime)
				ftim = ent->first_send;
			else
				ftim = ent->localtime;
		} else {
			/* TSNH */
			ftim = ent->localtime;
		}
		/*
		 * NOTE(review): the guard compares laa against ent->localtime
		 * but the subtraction uses ftim, which can be later than
		 * localtime (first_send). If localtime < laa < ftim the
		 * unsigned subtraction underflows to a huge tim — confirm
		 * whether the guard should be (laa > ftim).
		 */
		if (laa > ent->localtime)
			tim = laa - ftim;
		else
			tim = 0;
		/* cbw = bytes * usec-per-sec / elapsed-usec = bytes/sec */
		cbw = data * HPTS_USEC_IN_SEC;
		if (tim > 0)
			cbw /= tim;
		else
			cbw = 0;
		rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__);
		/*
		 * Check to see if we are freeing what we are pointing to send wise
		 * if so be sure to NULL the pointer so we know we are no longer
		 * set to anything.
		 */
		if (ent == rack->r_ctl.rc_last_sft) {
			rack->r_ctl.rc_last_sft = NULL;
			if (rack->rc_hybrid_mode) {
				/* Drop the per-request pacing caps with it. */
				rack->rc_catch_up = 0;
				if (rack->cspr_is_fcc == 0)
					rack->r_ctl.bw_rate_cap = 0;
				else
					rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
				rack->r_ctl.client_suggested_maxseg = 0;
			}
		}
		/* Generate the log that the tcp_netflix call would have */
		tcp_req_log_req_info(rack->rc_tp, ent,
				     i, TCP_TRK_REQ_LOG_FREED, 0, 0);
		/* Free it and see if there is another one */
		tcp_req_free_a_slot(rack->rc_tp, ent);
		ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
	}
}
#endif
11617
11618
11619 /*
11620 * Return value of 1, we do not need to call rack_process_data().
11621 * return value of 0, rack_process_data can be called.
11622 * For ret_val if its 0 the TCP is locked, if its non-zero
11623 * its unlocked and probably unsafe to touch the TCB.
11624 */
11625 static int
rack_process_ack(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,uint32_t tiwin,int32_t tlen,int32_t * ofia,int32_t thflags,int32_t * ret_val,int32_t orig_tlen)11626 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
11627 struct tcpcb *tp, struct tcpopt *to,
11628 uint32_t tiwin, int32_t tlen,
11629 int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen)
11630 {
11631 int32_t ourfinisacked = 0;
11632 int32_t nsegs, acked_amount;
11633 int32_t acked;
11634 struct mbuf *mfree;
11635 struct tcp_rack *rack;
11636 int32_t under_pacing = 0;
11637 int32_t post_recovery = 0;
11638 uint32_t p_cwnd;
11639
11640 INP_WLOCK_ASSERT(tptoinpcb(tp));
11641
11642 rack = (struct tcp_rack *)tp->t_fb_ptr;
11643 if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) {
11644 /* Checking SEG.ACK against ISS is definitely redundant. */
11645 tp->t_flags2 |= TF2_NO_ISS_CHECK;
11646 }
11647 if (!V_tcp_insecure_ack) {
11648 tcp_seq seq_min;
11649 bool ghost_ack_check;
11650
11651 if (tp->t_flags2 & TF2_NO_ISS_CHECK) {
11652 /* Check for too old ACKs (RFC 5961, Section 5.2). */
11653 seq_min = tp->snd_una - tp->max_sndwnd;
11654 ghost_ack_check = false;
11655 } else {
11656 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) {
11657 /* Checking for ghost ACKs is stricter. */
11658 seq_min = tp->iss + 1;
11659 ghost_ack_check = true;
11660 } else {
11661 /*
11662 * Checking for too old ACKs (RFC 5961,
11663 * Section 5.2) is stricter.
11664 */
11665 seq_min = tp->snd_una - tp->max_sndwnd;
11666 ghost_ack_check = false;
11667 }
11668 }
11669 if (SEQ_LT(th->th_ack, seq_min)) {
11670 if (ghost_ack_check)
11671 TCPSTAT_INC(tcps_rcvghostack);
11672 else
11673 TCPSTAT_INC(tcps_rcvacktooold);
11674 /* Send challenge ACK. */
11675 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
11676 rack->r_wanted_output = 1;
11677 return (1);
11678 }
11679 }
11680 if (SEQ_GT(th->th_ack, tp->snd_max)) {
11681 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
11682 rack->r_wanted_output = 1;
11683 return (1);
11684 }
11685 if (rack->gp_ready &&
11686 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11687 under_pacing = 1;
11688 }
11689 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
11690 int in_rec, dup_ack_struck = 0;
11691 int dsack_seen = 0, sacks_seen = 0;
11692
11693 in_rec = IN_FASTRECOVERY(tp->t_flags);
11694 if (rack->rc_in_persist) {
11695 tp->t_rxtshift = 0;
11696 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11697 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11698 }
11699
11700 if ((th->th_ack == tp->snd_una) &&
11701 (tiwin == tp->snd_wnd) &&
11702 (orig_tlen == 0) &&
11703 ((to->to_flags & TOF_SACK) == 0)) {
11704 rack_strike_dupack(rack, th->th_ack);
11705 dup_ack_struck = 1;
11706 }
11707 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
11708 dup_ack_struck, &dsack_seen, &sacks_seen);
11709
11710 }
11711 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
11712 /*
11713 * Old ack, behind (or duplicate to) the last one rcv'd
11714 * Note: We mark reordering is occuring if its
11715 * less than and we have not closed our window.
11716 */
11717 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
11718 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
11719 if (rack->r_ctl.rc_reorder_ts == 0)
11720 rack->r_ctl.rc_reorder_ts = 1;
11721 }
11722 return (0);
11723 }
11724 /*
11725 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
11726 * something we sent.
11727 */
11728 if (tp->t_flags & TF_NEEDSYN) {
11729 /*
11730 * T/TCP: Connection was half-synchronized, and our SYN has
11731 * been ACK'd (so connection is now fully synchronized). Go
11732 * to non-starred state, increment snd_una for ACK of SYN,
11733 * and check if we can do window scaling.
11734 */
11735 tp->t_flags &= ~TF_NEEDSYN;
11736 tp->snd_una++;
11737 /* Do window scaling? */
11738 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11739 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11740 tp->rcv_scale = tp->request_r_scale;
11741 /* Send window already scaled. */
11742 }
11743 }
11744 nsegs = max(1, m->m_pkthdr.lro_nsegs);
11745
11746 acked = BYTES_THIS_ACK(tp, th);
11747 if (acked) {
11748 /*
11749 * Any time we move the cum-ack forward clear
11750 * keep-alive tied probe-not-answered. The
11751 * persists clears its own on entry.
11752 */
11753 rack->probe_not_answered = 0;
11754 }
11755 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
11756 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
11757 /*
11758 * If we just performed our first retransmit, and the ACK arrives
11759 * within our recovery window, then it was a mistake to do the
11760 * retransmit in the first place. Recover our original cwnd and
11761 * ssthresh, and proceed to transmit where we left off.
11762 */
11763 if ((tp->t_flags & TF_PREVVALID) &&
11764 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
11765 tp->t_flags &= ~TF_PREVVALID;
11766 if (tp->t_rxtshift == 1 &&
11767 (int)(ticks - tp->t_badrxtwin) < 0)
11768 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
11769 }
11770 if (acked) {
11771 /* assure we are not backed off */
11772 tp->t_rxtshift = 0;
11773 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11774 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11775 rack->rc_tlp_in_progress = 0;
11776 rack->r_ctl.rc_tlp_cnt_out = 0;
11777 /*
11778 * If it is the RXT timer we want to
11779 * stop it, so we can restart a TLP.
11780 */
11781 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
11782 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11783 #ifdef TCP_REQUEST_TRK
11784 rack_req_check_for_comp(rack, th->th_ack);
11785 #endif
11786 }
11787 /*
11788 * If we have a timestamp reply, update smoothed round trip time. If
11789 * no timestamp is present but transmit timer is running and timed
11790 * sequence number was acked, update smoothed round trip time. Since
11791 * we now have an rtt measurement, cancel the timer backoff (cf.,
11792 * Phil Karn's retransmit alg.). Recompute the initial retransmit
11793 * timer.
11794 *
11795 * Some boxes send broken timestamp replies during the SYN+ACK
11796 * phase, ignore timestamps of 0 or we could calculate a huge RTT
11797 * and blow up the retransmit timer.
11798 */
11799 /*
11800 * If all outstanding data is acked, stop retransmit timer and
11801 * remember to restart (more output or persist). If there is more
11802 * data to be acked, restart retransmit timer, using current
11803 * (possibly backed-off) value.
11804 */
11805 if (acked == 0) {
11806 if (ofia)
11807 *ofia = ourfinisacked;
11808 return (0);
11809 }
11810 if (IN_RECOVERY(tp->t_flags)) {
11811 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
11812 (SEQ_LT(th->th_ack, tp->snd_max))) {
11813 tcp_rack_partialack(tp);
11814 } else {
11815 rack_post_recovery(tp, th->th_ack);
11816 post_recovery = 1;
11817 /*
11818 * Grab the segsiz, multiply by 2 and add the snd_cwnd
11819 * that is the max the CC should add if we are exiting
11820 * recovery and doing a late add.
11821 */
11822 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
11823 p_cwnd <<= 1;
11824 p_cwnd += tp->snd_cwnd;
11825 }
11826 } else if ((rack->rto_from_rec == 1) &&
11827 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
11828 /*
11829 * We were in recovery, hit a rxt timeout
11830 * and never re-entered recovery. The timeout(s)
11831 * made up all the lost data. In such a case
11832 * we need to clear the rto_from_rec flag.
11833 */
11834 rack->rto_from_rec = 0;
11835 }
11836 /*
11837 * Let the congestion control algorithm update congestion control
11838 * related information. This typically means increasing the
11839 * congestion window.
11840 */
11841 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery);
11842 if (post_recovery &&
11843 (tp->snd_cwnd > p_cwnd)) {
11844 /* Must be non-newreno (cubic) getting too ahead of itself */
11845 tp->snd_cwnd = p_cwnd;
11846 }
11847 SOCK_SENDBUF_LOCK(so);
11848 acked_amount = min(acked, (int)sbavail(&so->so_snd));
11849 tp->snd_wnd -= acked_amount;
11850 mfree = sbcut_locked(&so->so_snd, acked_amount);
11851 if ((sbused(&so->so_snd) == 0) &&
11852 (acked > acked_amount) &&
11853 (tp->t_state >= TCPS_FIN_WAIT_1) &&
11854 (tp->t_flags & TF_SENTFIN)) {
11855 /*
11856 * We must be sure our fin
11857 * was sent and acked (we can be
11858 * in FIN_WAIT_1 without having
11859 * sent the fin).
11860 */
11861 ourfinisacked = 1;
11862 }
11863 tp->snd_una = th->th_ack;
11864 /* wakeups? */
11865 if (acked_amount && sbavail(&so->so_snd))
11866 rack_adjust_sendmap_head(rack, &so->so_snd);
11867 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
11868 /* NB: sowwakeup_locked() does an implicit unlock. */
11869 sowwakeup_locked(so);
11870 m_freem(mfree);
11871 if (SEQ_GT(tp->snd_una, tp->snd_recover))
11872 tp->snd_recover = tp->snd_una;
11873
11874 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
11875 tp->snd_nxt = tp->snd_max;
11876 }
11877 if (under_pacing &&
11878 (rack->use_fixed_rate == 0) &&
11879 (rack->in_probe_rtt == 0) &&
11880 rack->rc_gp_dyn_mul &&
11881 rack->rc_always_pace) {
11882 /* Check if we are dragging bottom */
11883 rack_check_bottom_drag(tp, rack, so);
11884 }
11885 if (tp->snd_una == tp->snd_max) {
11886 /* Nothing left outstanding */
11887 tp->t_flags &= ~TF_PREVVALID;
11888 if (rack->r_ctl.rc_went_idle_time == 0)
11889 rack->r_ctl.rc_went_idle_time = 1;
11890 rack->r_ctl.retran_during_recovery = 0;
11891 rack->r_ctl.dsack_byte_cnt = 0;
11892 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
11893 if (sbavail(&tptosocket(tp)->so_snd) == 0)
11894 tp->t_acktime = 0;
11895 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11896 rack->rc_suspicious = 0;
11897 /* Set need output so persist might get set */
11898 rack->r_wanted_output = 1;
11899 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
11900 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
11901 (sbavail(&so->so_snd) == 0) &&
11902 (tp->t_flags2 & TF2_DROP_AF_DATA)) {
11903 /*
11904 * The socket was gone and the
11905 * peer sent data (now or in the past), time to
11906 * reset him.
11907 */
11908 *ret_val = 1;
11909 /* tcp_close will kill the inp pre-log the Reset */
11910 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
11911 tp = tcp_close(tp);
11912 ctf_do_dropwithreset(m, tp, th, tlen);
11913 return (1);
11914 }
11915 }
11916 if (ofia)
11917 *ofia = ourfinisacked;
11918 return (0);
11919 }
11920
11921
11922 static void
rack_log_collapse(struct tcp_rack * rack,uint32_t cnt,uint32_t split,uint32_t out,int line,int dir,uint32_t flags,struct rack_sendmap * rsm)11923 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
11924 int dir, uint32_t flags, struct rack_sendmap *rsm)
11925 {
11926 if (tcp_bblogging_on(rack->rc_tp)) {
11927 union tcp_log_stackspecific log;
11928 struct timeval tv;
11929
11930 memset(&log, 0, sizeof(log));
11931 log.u_bbr.flex1 = cnt;
11932 log.u_bbr.flex2 = split;
11933 log.u_bbr.flex3 = out;
11934 log.u_bbr.flex4 = line;
11935 log.u_bbr.flex5 = rack->r_must_retran;
11936 log.u_bbr.flex6 = flags;
11937 log.u_bbr.flex7 = rack->rc_has_collapsed;
11938 log.u_bbr.flex8 = dir; /*
11939 * 1 is collapsed, 0 is uncollapsed,
11940 * 2 is log of a rsm being marked, 3 is a split.
11941 */
11942 if (rsm == NULL)
11943 log.u_bbr.rttProp = 0;
11944 else
11945 log.u_bbr.rttProp = (uintptr_t)rsm;
11946 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11947 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
11948 TCP_LOG_EVENTP(rack->rc_tp, NULL,
11949 &rack->rc_inp->inp_socket->so_rcv,
11950 &rack->rc_inp->inp_socket->so_snd,
11951 TCP_RACK_LOG_COLLAPSE, 0,
11952 0, &log, false, &tv);
11953 }
11954 }
11955
11956 static void
rack_collapsed_window(struct tcp_rack * rack,uint32_t out,tcp_seq th_ack,int line)11957 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line)
11958 {
11959 /*
11960 * Here all we do is mark the collapsed point and set the flag.
11961 * This may happen again and again, but there is no
11962 * sense splitting our map until we know where the
11963 * peer finally lands in the collapse.
11964 */
11965 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
11966 if ((rack->rc_has_collapsed == 0) ||
11967 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd)))
11968 counter_u64_add(rack_collapsed_win_seen, 1);
11969 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd;
11970 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
11971 rack->rc_has_collapsed = 1;
11972 rack->r_collapse_point_valid = 1;
11973 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
11974 }
11975
11976 static void
rack_un_collapse_window(struct tcp_rack * rack,int line)11977 rack_un_collapse_window(struct tcp_rack *rack, int line)
11978 {
11979 struct rack_sendmap *nrsm, *rsm;
11980 int cnt = 0, split = 0;
11981 int insret __diagused;
11982
11983
11984 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
11985 rack->rc_has_collapsed = 0;
11986 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
11987 if (rsm == NULL) {
11988 /* Nothing to do maybe the peer ack'ed it all */
11989 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
11990 return;
11991 }
11992 /* Now do we need to split this one? */
11993 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
11994 rack_log_collapse(rack, rsm->r_start, rsm->r_end,
11995 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
11996 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
11997 if (nrsm == NULL) {
11998 /* We can't get a rsm, mark all? */
11999 nrsm = rsm;
12000 goto no_split;
12001 }
12002 /* Clone it */
12003 split = 1;
12004 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
12005 #ifndef INVARIANTS
12006 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
12007 #else
12008 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
12009 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
12010 nrsm, insret, rack, rsm);
12011 }
12012 #endif
12013 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
12014 rack->r_ctl.last_collapse_point, __LINE__);
12015 if (rsm->r_in_tmap) {
12016 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
12017 nrsm->r_in_tmap = 1;
12018 }
12019 /*
12020 * Set in the new RSM as the
12021 * collapsed starting point
12022 */
12023 rsm = nrsm;
12024 }
12025
12026 no_split:
12027 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) {
12028 cnt++;
12029 nrsm->r_flags |= RACK_RWND_COLLAPSED;
12030 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
12031 cnt++;
12032 }
12033 if (cnt) {
12034 counter_u64_add(rack_collapsed_win, 1);
12035 }
12036 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
12037 }
12038
12039 static void
rack_handle_delayed_ack(struct tcpcb * tp,struct tcp_rack * rack,int32_t tlen,int32_t tfo_syn)12040 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
12041 int32_t tlen, int32_t tfo_syn)
12042 {
12043 if (DELAY_ACK(tp, tlen) || tfo_syn) {
12044 rack_timer_cancel(tp, rack,
12045 rack->r_ctl.rc_rcvtime, __LINE__);
12046 tp->t_flags |= TF_DELACK;
12047 } else {
12048 rack->r_wanted_output = 1;
12049 tp->t_flags |= TF_ACKNOW;
12050 }
12051 }
12052
12053 static void
rack_validate_fo_sendwin_up(struct tcpcb * tp,struct tcp_rack * rack)12054 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
12055 {
12056 /*
12057 * If fast output is in progress, lets validate that
12058 * the new window did not shrink on us and make it
12059 * so fast output should end.
12060 */
12061 if (rack->r_fast_output) {
12062 uint32_t out;
12063
12064 /*
12065 * Calculate what we will send if left as is
12066 * and compare that to our send window.
12067 */
12068 out = ctf_outstanding(tp);
12069 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
12070 /* ok we have an issue */
12071 if (out >= tp->snd_wnd) {
12072 /* Turn off fast output the window is met or collapsed */
12073 rack->r_fast_output = 0;
12074 } else {
12075 /* we have some room left */
12076 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
12077 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
12078 /* If not at least 1 full segment never mind */
12079 rack->r_fast_output = 0;
12080 }
12081 }
12082 }
12083 }
12084 }
12085
12086 /*
12087 * Return value of 1, the TCB is unlocked and most
12088 * likely gone, return value of 0, the TCP is still
12089 * locked.
12090 */
12091 static int
rack_process_data(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t thflags,int32_t nxt_pkt)12092 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
12093 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
12094 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
12095 {
12096 /*
12097 * Update window information. Don't look at window if no ACK: TAC's
12098 * send garbage on first SYN.
12099 */
12100 int32_t nsegs;
12101 int32_t tfo_syn;
12102 struct tcp_rack *rack;
12103
12104 INP_WLOCK_ASSERT(tptoinpcb(tp));
12105
12106 rack = (struct tcp_rack *)tp->t_fb_ptr;
12107 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12108 if ((thflags & TH_ACK) &&
12109 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
12110 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
12111 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
12112 /* keep track of pure window updates */
12113 if (tlen == 0 &&
12114 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
12115 KMOD_TCPSTAT_INC(tcps_rcvwinupd);
12116 tp->snd_wnd = tiwin;
12117 rack_validate_fo_sendwin_up(tp, rack);
12118 tp->snd_wl1 = th->th_seq;
12119 tp->snd_wl2 = th->th_ack;
12120 if (tp->snd_wnd > tp->max_sndwnd)
12121 tp->max_sndwnd = tp->snd_wnd;
12122 rack->r_wanted_output = 1;
12123 } else if (thflags & TH_ACK) {
12124 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
12125 tp->snd_wnd = tiwin;
12126 rack_validate_fo_sendwin_up(tp, rack);
12127 tp->snd_wl1 = th->th_seq;
12128 tp->snd_wl2 = th->th_ack;
12129 }
12130 }
12131 if (tp->snd_wnd < ctf_outstanding(tp))
12132 /* The peer collapsed the window */
12133 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12134 else if (rack->rc_has_collapsed)
12135 rack_un_collapse_window(rack, __LINE__);
12136 if ((rack->r_collapse_point_valid) &&
12137 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
12138 rack->r_collapse_point_valid = 0;
12139 /* Was persist timer active and now we have window space? */
12140 if ((rack->rc_in_persist != 0) &&
12141 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12142 rack->r_ctl.rc_pace_min_segs))) {
12143 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
12144 tp->snd_nxt = tp->snd_max;
12145 /* Make sure we output to start the timer */
12146 rack->r_wanted_output = 1;
12147 }
12148 /* Do we enter persists? */
12149 if ((rack->rc_in_persist == 0) &&
12150 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12151 TCPS_HAVEESTABLISHED(tp->t_state) &&
12152 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12153 sbavail(&tptosocket(tp)->so_snd) &&
12154 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12155 /*
12156 * Here the rwnd is less than
12157 * the pacing size, we are established,
12158 * nothing is outstanding, and there is
12159 * data to send. Enter persists.
12160 */
12161 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
12162 }
12163 if (tp->t_flags2 & TF2_DROP_AF_DATA) {
12164 m_freem(m);
12165 return (0);
12166 }
12167 /*
12168 * don't process the URG bit, ignore them drag
12169 * along the up.
12170 */
12171 tp->rcv_up = tp->rcv_nxt;
12172
12173 /*
12174 * Process the segment text, merging it into the TCP sequencing
12175 * queue, and arranging for acknowledgment of receipt if necessary.
12176 * This process logically involves adjusting tp->rcv_wnd as data is
12177 * presented to the user (this happens in tcp_usrreq.c, case
12178 * PRU_RCVD). If a FIN has already been received on this connection
12179 * then we just ignore the text.
12180 */
12181 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
12182 (tp->t_flags & TF_FASTOPEN));
12183 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
12184 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12185 tcp_seq save_start = th->th_seq;
12186 tcp_seq save_rnxt = tp->rcv_nxt;
12187 int save_tlen = tlen;
12188
12189 m_adj(m, drop_hdrlen); /* delayed header drop */
12190 /*
12191 * Insert segment which includes th into TCP reassembly
12192 * queue with control block tp. Set thflags to whether
12193 * reassembly now includes a segment with FIN. This handles
12194 * the common case inline (segment is the next to be
12195 * received on an established connection, and the queue is
12196 * empty), avoiding linkage into and removal from the queue
12197 * and repetition of various conversions. Set DELACK for
12198 * segments received in order, but ack immediately when
12199 * segments are out of order (so fast retransmit can work).
12200 */
12201 if (th->th_seq == tp->rcv_nxt &&
12202 SEGQ_EMPTY(tp) &&
12203 (TCPS_HAVEESTABLISHED(tp->t_state) ||
12204 tfo_syn)) {
12205 #ifdef NETFLIX_SB_LIMITS
12206 u_int mcnt, appended;
12207
12208 if (so->so_rcv.sb_shlim) {
12209 mcnt = m_memcnt(m);
12210 appended = 0;
12211 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12212 CFO_NOSLEEP, NULL) == false) {
12213 counter_u64_add(tcp_sb_shlim_fails, 1);
12214 m_freem(m);
12215 return (0);
12216 }
12217 }
12218 #endif
12219 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
12220 tp->rcv_nxt += tlen;
12221 if (tlen &&
12222 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12223 (tp->t_fbyte_in == 0)) {
12224 tp->t_fbyte_in = ticks;
12225 if (tp->t_fbyte_in == 0)
12226 tp->t_fbyte_in = 1;
12227 if (tp->t_fbyte_out && tp->t_fbyte_in)
12228 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12229 }
12230 thflags = tcp_get_flags(th) & TH_FIN;
12231 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12232 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12233 SOCK_RECVBUF_LOCK(so);
12234 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12235 m_freem(m);
12236 } else {
12237 int32_t newsize;
12238
12239 if (tlen > 0) {
12240 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
12241 if (newsize)
12242 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
12243 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
12244 }
12245 #ifdef NETFLIX_SB_LIMITS
12246 appended =
12247 #endif
12248 sbappendstream_locked(&so->so_rcv, m, 0);
12249 }
12250 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
12251 /* NB: sorwakeup_locked() does an implicit unlock. */
12252 sorwakeup_locked(so);
12253 #ifdef NETFLIX_SB_LIMITS
12254 if (so->so_rcv.sb_shlim && appended != mcnt)
12255 counter_fo_release(so->so_rcv.sb_shlim,
12256 mcnt - appended);
12257 #endif
12258 } else {
12259 /*
12260 * XXX: Due to the header drop above "th" is
12261 * theoretically invalid by now. Fortunately
12262 * m_adj() doesn't actually frees any mbufs when
12263 * trimming from the head.
12264 */
12265 tcp_seq temp = save_start;
12266
12267 thflags = tcp_reass(tp, th, &temp, &tlen, m);
12268 tp->t_flags |= TF_ACKNOW;
12269 if (tp->t_flags & TF_WAKESOR) {
12270 tp->t_flags &= ~TF_WAKESOR;
12271 /* NB: sorwakeup_locked() does an implicit unlock. */
12272 sorwakeup_locked(so);
12273 }
12274 }
12275 if ((tp->t_flags & TF_SACK_PERMIT) &&
12276 (save_tlen > 0) &&
12277 TCPS_HAVEESTABLISHED(tp->t_state)) {
12278 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
12279 /*
12280 * DSACK actually handled in the fastpath
12281 * above.
12282 */
12283 tcp_update_sack_list(tp, save_start,
12284 save_start + save_tlen);
12285 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
12286 if ((tp->rcv_numsacks >= 1) &&
12287 (tp->sackblks[0].end == save_start)) {
12288 /*
12289 * Partial overlap, recorded at todrop
12290 * above.
12291 */
12292 tcp_update_sack_list(tp,
12293 tp->sackblks[0].start,
12294 tp->sackblks[0].end);
12295 } else {
12296 tcp_update_dsack_list(tp, save_start,
12297 save_start + save_tlen);
12298 }
12299 } else if (tlen >= save_tlen) {
12300 /* Update of sackblks. */
12301 tcp_update_dsack_list(tp, save_start,
12302 save_start + save_tlen);
12303 } else if (tlen > 0) {
12304 tcp_update_dsack_list(tp, save_start,
12305 save_start + tlen);
12306 }
12307 }
12308 } else {
12309 m_freem(m);
12310 thflags &= ~TH_FIN;
12311 }
12312
12313 /*
12314 * If FIN is received ACK the FIN and let the user know that the
12315 * connection is closing.
12316 */
12317 if (thflags & TH_FIN) {
12318 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12319 /* The socket upcall is handled by socantrcvmore. */
12320 socantrcvmore(so);
12321 /*
12322 * If connection is half-synchronized (ie NEEDSYN
12323 * flag on) then delay ACK, so it may be piggybacked
12324 * when SYN is sent. Otherwise, since we received a
12325 * FIN then no more input can be expected, send ACK
12326 * now.
12327 */
12328 if (tp->t_flags & TF_NEEDSYN) {
12329 rack_timer_cancel(tp, rack,
12330 rack->r_ctl.rc_rcvtime, __LINE__);
12331 tp->t_flags |= TF_DELACK;
12332 } else {
12333 tp->t_flags |= TF_ACKNOW;
12334 }
12335 tp->rcv_nxt++;
12336 }
12337 switch (tp->t_state) {
12338 /*
12339 * In SYN_RECEIVED and ESTABLISHED STATES enter the
12340 * CLOSE_WAIT state.
12341 */
12342 case TCPS_SYN_RECEIVED:
12343 tp->t_starttime = ticks;
12344 /* FALLTHROUGH */
12345 case TCPS_ESTABLISHED:
12346 rack_timer_cancel(tp, rack,
12347 rack->r_ctl.rc_rcvtime, __LINE__);
12348 tcp_state_change(tp, TCPS_CLOSE_WAIT);
12349 break;
12350
12351 /*
12352 * If still in FIN_WAIT_1 STATE FIN has not been
12353 * acked so enter the CLOSING state.
12354 */
12355 case TCPS_FIN_WAIT_1:
12356 rack_timer_cancel(tp, rack,
12357 rack->r_ctl.rc_rcvtime, __LINE__);
12358 tcp_state_change(tp, TCPS_CLOSING);
12359 break;
12360
12361 /*
12362 * In FIN_WAIT_2 state enter the TIME_WAIT state,
12363 * starting the time-wait timer, turning off the
12364 * other standard timers.
12365 */
12366 case TCPS_FIN_WAIT_2:
12367 rack_timer_cancel(tp, rack,
12368 rack->r_ctl.rc_rcvtime, __LINE__);
12369 tcp_twstart(tp);
12370 return (1);
12371 }
12372 }
12373 /*
12374 * Return any desired output.
12375 */
12376 if ((tp->t_flags & TF_ACKNOW) ||
12377 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
12378 rack->r_wanted_output = 1;
12379 }
12380 return (0);
12381 }
12382
12383 /*
12384 * Here nothing is really faster, its just that we
12385 * have broken out the fast-data path also just like
12386 * the fast-ack.
12387 */
12388 static int
rack_do_fastnewdata(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t nxt_pkt,uint8_t iptos)12389 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
12390 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12391 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
12392 {
12393 int32_t nsegs;
12394 int32_t newsize = 0; /* automatic sockbuf scaling */
12395 struct tcp_rack *rack;
12396 #ifdef NETFLIX_SB_LIMITS
12397 u_int mcnt, appended;
12398 #endif
12399
12400 /*
12401 * If last ACK falls within this segment's sequence numbers, record
12402 * the timestamp. NOTE that the test is modified according to the
12403 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12404 */
12405 if (__predict_false(th->th_seq != tp->rcv_nxt)) {
12406 return (0);
12407 }
12408 if (tiwin && tiwin != tp->snd_wnd) {
12409 return (0);
12410 }
12411 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
12412 return (0);
12413 }
12414 if (__predict_false((to->to_flags & TOF_TS) &&
12415 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
12416 return (0);
12417 }
12418 if (__predict_false((th->th_ack != tp->snd_una))) {
12419 return (0);
12420 }
12421 if (__predict_false(tlen > sbspace(&so->so_rcv))) {
12422 return (0);
12423 }
12424 if ((to->to_flags & TOF_TS) != 0 &&
12425 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12426 tp->ts_recent_age = tcp_ts_getticks();
12427 tp->ts_recent = to->to_tsval;
12428 }
12429 rack = (struct tcp_rack *)tp->t_fb_ptr;
12430 /*
12431 * This is a pure, in-sequence data packet with nothing on the
12432 * reassembly queue and we have enough buffer space to take it.
12433 */
12434 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12435
12436 #ifdef NETFLIX_SB_LIMITS
12437 if (so->so_rcv.sb_shlim) {
12438 mcnt = m_memcnt(m);
12439 appended = 0;
12440 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12441 CFO_NOSLEEP, NULL) == false) {
12442 counter_u64_add(tcp_sb_shlim_fails, 1);
12443 m_freem(m);
12444 return (1);
12445 }
12446 }
12447 #endif
12448 /* Clean receiver SACK report if present */
12449 if (tp->rcv_numsacks)
12450 tcp_clean_sackreport(tp);
12451 KMOD_TCPSTAT_INC(tcps_preddat);
12452 tp->rcv_nxt += tlen;
12453 if (tlen &&
12454 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12455 (tp->t_fbyte_in == 0)) {
12456 tp->t_fbyte_in = ticks;
12457 if (tp->t_fbyte_in == 0)
12458 tp->t_fbyte_in = 1;
12459 if (tp->t_fbyte_out && tp->t_fbyte_in)
12460 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12461 }
12462 /*
12463 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
12464 */
12465 tp->snd_wl1 = th->th_seq;
12466 /*
12467 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
12468 */
12469 tp->rcv_up = tp->rcv_nxt;
12470 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12471 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12472 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
12473
12474 /* Add data to socket buffer. */
12475 SOCK_RECVBUF_LOCK(so);
12476 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12477 m_freem(m);
12478 } else {
12479 /*
12480 * Set new socket buffer size. Give up when limit is
12481 * reached.
12482 */
12483 if (newsize)
12484 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
12485 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
12486 m_adj(m, drop_hdrlen); /* delayed header drop */
12487 #ifdef NETFLIX_SB_LIMITS
12488 appended =
12489 #endif
12490 sbappendstream_locked(&so->so_rcv, m, 0);
12491 ctf_calc_rwin(so, tp);
12492 }
12493 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
12494 /* NB: sorwakeup_locked() does an implicit unlock. */
12495 sorwakeup_locked(so);
12496 #ifdef NETFLIX_SB_LIMITS
12497 if (so->so_rcv.sb_shlim && mcnt != appended)
12498 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
12499 #endif
12500 rack_handle_delayed_ack(tp, rack, tlen, 0);
12501 if (tp->snd_una == tp->snd_max)
12502 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
12503 return (1);
12504 }
12505
12506 /*
12507 * This subfunction is used to try to highly optimize the
12508 * fast path. We again allow window updates that are
12509 * in sequence to remain in the fast-path. We also add
12510 * in the __predict's to attempt to help the compiler.
12511 * Note that if we return a 0, then we can *not* process
12512 * it and the caller should push the packet into the
12513 * slow-path.
12514 */
12515 static int
rack_fastack(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t nxt_pkt,uint32_t cts)12516 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12517 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12518 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
12519 {
12520 int32_t acked;
12521 int32_t nsegs;
12522 int32_t under_pacing = 0;
12523 struct tcp_rack *rack;
12524
12525 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
12526 /* Old ack, behind (or duplicate to) the last one rcv'd */
12527 return (0);
12528 }
12529 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
12530 /* Above what we have sent? */
12531 return (0);
12532 }
12533 if (__predict_false(tiwin == 0)) {
12534 /* zero window */
12535 return (0);
12536 }
12537 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
12538 /* We need a SYN or a FIN, unlikely.. */
12539 return (0);
12540 }
12541 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
12542 /* Timestamp is behind .. old ack with seq wrap? */
12543 return (0);
12544 }
12545 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
12546 /* Still recovering */
12547 return (0);
12548 }
12549 rack = (struct tcp_rack *)tp->t_fb_ptr;
12550 if (rack->r_ctl.rc_sacked) {
12551 /* We have sack holes on our scoreboard */
12552 return (0);
12553 }
12554 /* Ok if we reach here, we can process a fast-ack */
12555 if (rack->gp_ready &&
12556 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12557 under_pacing = 1;
12558 }
12559 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12560 rack_log_ack(tp, to, th, 0, 0, NULL, NULL);
12561 /* Did the window get updated? */
12562 if (tiwin != tp->snd_wnd) {
12563 tp->snd_wnd = tiwin;
12564 rack_validate_fo_sendwin_up(tp, rack);
12565 tp->snd_wl1 = th->th_seq;
12566 if (tp->snd_wnd > tp->max_sndwnd)
12567 tp->max_sndwnd = tp->snd_wnd;
12568 }
12569 /* Do we exit persists? */
12570 if ((rack->rc_in_persist != 0) &&
12571 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12572 rack->r_ctl.rc_pace_min_segs))) {
12573 rack_exit_persist(tp, rack, cts);
12574 }
12575 /* Do we enter persists? */
12576 if ((rack->rc_in_persist == 0) &&
12577 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12578 TCPS_HAVEESTABLISHED(tp->t_state) &&
12579 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12580 sbavail(&tptosocket(tp)->so_snd) &&
12581 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12582 /*
12583 * Here the rwnd is less than
12584 * the pacing size, we are established,
12585 * nothing is outstanding, and there is
12586 * data to send. Enter persists.
12587 */
12588 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack);
12589 }
12590 /*
12591 * If last ACK falls within this segment's sequence numbers, record
12592 * the timestamp. NOTE that the test is modified according to the
12593 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12594 */
12595 if ((to->to_flags & TOF_TS) != 0 &&
12596 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12597 tp->ts_recent_age = tcp_ts_getticks();
12598 tp->ts_recent = to->to_tsval;
12599 }
12600 /*
12601 * This is a pure ack for outstanding data.
12602 */
12603 KMOD_TCPSTAT_INC(tcps_predack);
12604
12605 /*
12606 * "bad retransmit" recovery.
12607 */
12608 if ((tp->t_flags & TF_PREVVALID) &&
12609 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
12610 tp->t_flags &= ~TF_PREVVALID;
12611 if (tp->t_rxtshift == 1 &&
12612 (int)(ticks - tp->t_badrxtwin) < 0)
12613 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
12614 }
12615 /*
12616 * Recalculate the transmit timer / rtt.
12617 *
12618 * Some boxes send broken timestamp replies during the SYN+ACK
12619 * phase, ignore timestamps of 0 or we could calculate a huge RTT
12620 * and blow up the retransmit timer.
12621 */
12622 acked = BYTES_THIS_ACK(tp, th);
12623
12624 #ifdef TCP_HHOOK
12625 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
12626 hhook_run_tcp_est_in(tp, th, to);
12627 #endif
12628 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
12629 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
12630 if (acked) {
12631 struct mbuf *mfree;
12632
12633 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
12634 SOCK_SENDBUF_LOCK(so);
12635 mfree = sbcut_locked(&so->so_snd, acked);
12636 tp->snd_una = th->th_ack;
12637 /* Note we want to hold the sb lock through the sendmap adjust */
12638 rack_adjust_sendmap_head(rack, &so->so_snd);
12639 /* Wake up the socket if we have room to write more */
12640 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
12641 sowwakeup_locked(so);
12642 m_freem(mfree);
12643 tp->t_rxtshift = 0;
12644 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
12645 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
12646 rack->rc_tlp_in_progress = 0;
12647 rack->r_ctl.rc_tlp_cnt_out = 0;
12648 /*
12649 * If it is the RXT timer we want to
12650 * stop it, so we can restart a TLP.
12651 */
12652 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
12653 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12654
12655 #ifdef TCP_REQUEST_TRK
12656 rack_req_check_for_comp(rack, th->th_ack);
12657 #endif
12658 }
12659 /*
12660 * Let the congestion control algorithm update congestion control
12661 * related information. This typically means increasing the
12662 * congestion window.
12663 */
12664 if (tp->snd_wnd < ctf_outstanding(tp)) {
12665 /* The peer collapsed the window */
12666 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12667 } else if (rack->rc_has_collapsed)
12668 rack_un_collapse_window(rack, __LINE__);
12669 if ((rack->r_collapse_point_valid) &&
12670 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
12671 rack->r_collapse_point_valid = 0;
12672 /*
12673 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
12674 */
12675 tp->snd_wl2 = th->th_ack;
12676 tp->t_dupacks = 0;
12677 m_freem(m);
12678 /* ND6_HINT(tp); *//* Some progress has been made. */
12679
12680 /*
12681 * If all outstanding data are acked, stop retransmit timer,
12682 * otherwise restart timer using current (possibly backed-off)
12683 * value. If process is waiting for space, wakeup/selwakeup/signal.
12684 * If data are ready to send, let tcp_output decide between more
12685 * output or persist.
12686 */
12687 if (under_pacing &&
12688 (rack->use_fixed_rate == 0) &&
12689 (rack->in_probe_rtt == 0) &&
12690 rack->rc_gp_dyn_mul &&
12691 rack->rc_always_pace) {
12692 /* Check if we are dragging bottom */
12693 rack_check_bottom_drag(tp, rack, so);
12694 }
12695 if (tp->snd_una == tp->snd_max) {
12696 tp->t_flags &= ~TF_PREVVALID;
12697 rack->r_ctl.retran_during_recovery = 0;
12698 rack->rc_suspicious = 0;
12699 rack->r_ctl.dsack_byte_cnt = 0;
12700 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
12701 if (rack->r_ctl.rc_went_idle_time == 0)
12702 rack->r_ctl.rc_went_idle_time = 1;
12703 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
12704 if (sbavail(&tptosocket(tp)->so_snd) == 0)
12705 tp->t_acktime = 0;
12706 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12707 }
12708 if (acked && rack->r_fast_output)
12709 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
12710 if (sbavail(&so->so_snd)) {
12711 rack->r_wanted_output = 1;
12712 }
12713 return (1);
12714 }
12715
12716 /*
12717 * Return value of 1, the TCB is unlocked and most
12718 * likely gone, return value of 0, the TCP is still
12719 * locked.
12720 */
12721 static int
rack_do_syn_sent(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t thflags,int32_t nxt_pkt,uint8_t iptos)12722 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
12723 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12724 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12725 {
12726 int32_t ret_val = 0;
12727 int32_t orig_tlen = tlen;
12728 int32_t todrop;
12729 int32_t ourfinisacked = 0;
12730 struct tcp_rack *rack;
12731
12732 INP_WLOCK_ASSERT(tptoinpcb(tp));
12733
12734 ctf_calc_rwin(so, tp);
12735 /*
12736 * If the state is SYN_SENT: if seg contains an ACK, but not for our
12737 * SYN, drop the input. if seg contains a RST, then drop the
12738 * connection. if seg does not contain SYN, then drop it. Otherwise
12739 * this is an acceptable SYN segment initialize tp->rcv_nxt and
12740 * tp->irs if seg contains ack then advance tp->snd_una if seg
12741 * contains an ECE and ECN support is enabled, the stream is ECN
12742 * capable. if SYN has been acked change to ESTABLISHED else
12743 * SYN_RCVD state arrange for segment to be acked (eventually)
12744 * continue processing rest of data/controls.
12745 */
12746 if ((thflags & TH_ACK) &&
12747 (SEQ_LEQ(th->th_ack, tp->iss) ||
12748 SEQ_GT(th->th_ack, tp->snd_max))) {
12749 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12750 ctf_do_dropwithreset(m, tp, th, tlen);
12751 return (1);
12752 }
12753 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
12754 TCP_PROBE5(connect__refused, NULL, tp,
12755 mtod(m, const char *), tp, th);
12756 tp = tcp_drop(tp, ECONNREFUSED);
12757 ctf_do_drop(m, tp);
12758 return (1);
12759 }
12760 if (thflags & TH_RST) {
12761 ctf_do_drop(m, tp);
12762 return (1);
12763 }
12764 if (!(thflags & TH_SYN)) {
12765 ctf_do_drop(m, tp);
12766 return (1);
12767 }
12768 tp->irs = th->th_seq;
12769 tcp_rcvseqinit(tp);
12770 rack = (struct tcp_rack *)tp->t_fb_ptr;
12771 if (thflags & TH_ACK) {
12772 int tfo_partial = 0;
12773
12774 KMOD_TCPSTAT_INC(tcps_connects);
12775 soisconnected(so);
12776 #ifdef MAC
12777 mac_socketpeer_set_from_mbuf(m, so);
12778 #endif
12779 /* Do window scaling on this connection? */
12780 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
12781 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
12782 tp->rcv_scale = tp->request_r_scale;
12783 }
12784 tp->rcv_adv += min(tp->rcv_wnd,
12785 TCP_MAXWIN << tp->rcv_scale);
12786 /*
12787 * If not all the data that was sent in the TFO SYN
12788 * has been acked, resend the remainder right away.
12789 */
12790 if ((tp->t_flags & TF_FASTOPEN) &&
12791 (tp->snd_una != tp->snd_max)) {
12792 /* Was it a partial ack? */
12793 if (SEQ_LT(th->th_ack, tp->snd_max))
12794 tfo_partial = 1;
12795 }
12796 /*
12797 * If there's data, delay ACK; if there's also a FIN ACKNOW
12798 * will be turned on later.
12799 */
12800 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
12801 rack_timer_cancel(tp, rack,
12802 rack->r_ctl.rc_rcvtime, __LINE__);
12803 tp->t_flags |= TF_DELACK;
12804 } else {
12805 rack->r_wanted_output = 1;
12806 tp->t_flags |= TF_ACKNOW;
12807 }
12808
12809 tcp_ecn_input_syn_sent(tp, thflags, iptos);
12810
12811 if (SEQ_GT(th->th_ack, tp->snd_una)) {
12812 /*
12813 * We advance snd_una for the
12814 * fast open case. If th_ack is
12815 * acknowledging data beyond
12816 * snd_una we can't just call
12817 * ack-processing since the
12818 * data stream in our send-map
12819 * will start at snd_una + 1 (one
12820 * beyond the SYN). If its just
12821 * equal we don't need to do that
12822 * and there is no send_map.
12823 */
12824 tp->snd_una++;
12825 if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) {
12826 /*
12827 * We sent a SYN with data, and thus have a
12828 * sendmap entry with a SYN set. Lets find it
12829 * and take off the send bit and the byte and
12830 * set it up to be what we send (send it next).
12831 */
12832 struct rack_sendmap *rsm;
12833
12834 rsm = tqhash_min(rack->r_ctl.tqh);
12835 if (rsm) {
12836 if (rsm->r_flags & RACK_HAS_SYN) {
12837 rsm->r_flags &= ~RACK_HAS_SYN;
12838 rsm->r_start++;
12839 }
12840 rack->r_ctl.rc_resend = rsm;
12841 }
12842 }
12843 }
12844 /*
12845 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
12846 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
12847 */
12848 tp->t_starttime = ticks;
12849 if (tp->t_flags & TF_NEEDFIN) {
12850 tcp_state_change(tp, TCPS_FIN_WAIT_1);
12851 tp->t_flags &= ~TF_NEEDFIN;
12852 thflags &= ~TH_SYN;
12853 } else {
12854 tcp_state_change(tp, TCPS_ESTABLISHED);
12855 TCP_PROBE5(connect__established, NULL, tp,
12856 mtod(m, const char *), tp, th);
12857 rack_cc_conn_init(tp);
12858 }
12859 } else {
12860 /*
12861 * Received initial SYN in SYN-SENT[*] state => simultaneous
12862 * open. If segment contains CC option and there is a
12863 * cached CC, apply TAO test. If it succeeds, connection is *
12864 * half-synchronized. Otherwise, do 3-way handshake:
12865 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
12866 * there was no CC option, clear cached CC value.
12867 */
12868 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
12869 tcp_state_change(tp, TCPS_SYN_RECEIVED);
12870 }
12871 /*
12872 * Advance th->th_seq to correspond to first data byte. If data,
12873 * trim to stay within window, dropping FIN if necessary.
12874 */
12875 th->th_seq++;
12876 if (tlen > tp->rcv_wnd) {
12877 todrop = tlen - tp->rcv_wnd;
12878 m_adj(m, -todrop);
12879 tlen = tp->rcv_wnd;
12880 thflags &= ~TH_FIN;
12881 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
12882 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
12883 }
12884 tp->snd_wl1 = th->th_seq - 1;
12885 tp->rcv_up = th->th_seq;
12886 /*
12887 * Client side of transaction: already sent SYN and data. If the
12888 * remote host used T/TCP to validate the SYN, our data will be
12889 * ACK'd; if so, enter normal data segment processing in the middle
12890 * of step 5, ack processing. Otherwise, goto step 6.
12891 */
12892 if (thflags & TH_ACK) {
12893 /* For syn-sent we need to possibly update the rtt */
12894 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
12895 uint32_t t, mcts;
12896
12897 mcts = tcp_ts_getticks();
12898 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
12899 if (!tp->t_rttlow || tp->t_rttlow > t)
12900 tp->t_rttlow = t;
12901 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
12902 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
12903 tcp_rack_xmit_timer_commit(rack, tp);
12904 }
12905 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen))
12906 return (ret_val);
12907 /* We may have changed to FIN_WAIT_1 above */
12908 if (tp->t_state == TCPS_FIN_WAIT_1) {
12909 /*
12910 * In FIN_WAIT_1 STATE in addition to the processing
12911 * for the ESTABLISHED state if our FIN is now
12912 * acknowledged then enter FIN_WAIT_2.
12913 */
12914 if (ourfinisacked) {
12915 /*
12916 * If we can't receive any more data, then
12917 * closing user can proceed. Starting the
12918 * timer is contrary to the specification,
12919 * but if we don't get a FIN we'll hang
12920 * forever.
12921 *
12922 * XXXjl: we should release the tp also, and
12923 * use a compressed state.
12924 */
12925 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12926 soisdisconnected(so);
12927 tcp_timer_activate(tp, TT_2MSL,
12928 (tcp_fast_finwait2_recycle ?
12929 tcp_finwait2_timeout :
12930 TP_MAXIDLE(tp)));
12931 }
12932 tcp_state_change(tp, TCPS_FIN_WAIT_2);
12933 }
12934 }
12935 }
12936 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12937 tiwin, thflags, nxt_pkt));
12938 }
12939
12940 /*
12941 * Return value of 1, the TCB is unlocked and most
12942 * likely gone, return value of 0, the TCP is still
12943 * locked.
12944 */
12945 static int
rack_do_syn_recv(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t thflags,int32_t nxt_pkt,uint8_t iptos)12946 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
12947 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12948 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12949 {
12950 struct tcp_rack *rack;
12951 int32_t orig_tlen = tlen;
12952 int32_t ret_val = 0;
12953 int32_t ourfinisacked = 0;
12954
12955 rack = (struct tcp_rack *)tp->t_fb_ptr;
12956 ctf_calc_rwin(so, tp);
12957 if ((thflags & TH_RST) ||
12958 (tp->t_fin_is_rst && (thflags & TH_FIN)))
12959 return (ctf_process_rst(m, th, so, tp));
12960 if ((thflags & TH_ACK) &&
12961 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
12962 SEQ_GT(th->th_ack, tp->snd_max))) {
12963 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12964 ctf_do_dropwithreset(m, tp, th, tlen);
12965 return (1);
12966 }
12967 if (tp->t_flags & TF_FASTOPEN) {
12968 /*
12969 * When a TFO connection is in SYN_RECEIVED, the
12970 * only valid packets are the initial SYN, a
12971 * retransmit/copy of the initial SYN (possibly with
12972 * a subset of the original data), a valid ACK, a
12973 * FIN, or a RST.
12974 */
12975 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
12976 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12977 ctf_do_dropwithreset(m, tp, th, tlen);
12978 return (1);
12979 } else if (thflags & TH_SYN) {
12980 /* non-initial SYN is ignored */
12981 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
12982 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
12983 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
12984 ctf_do_drop(m, NULL);
12985 return (0);
12986 }
12987 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
12988 ctf_do_drop(m, NULL);
12989 return (0);
12990 }
12991 }
12992
12993 /*
12994 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12995 * it's less than ts_recent, drop it.
12996 */
12997 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12998 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12999 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13000 return (ret_val);
13001 }
13002 /*
13003 * In the SYN-RECEIVED state, validate that the packet belongs to
13004 * this connection before trimming the data to fit the receive
13005 * window. Check the sequence number versus IRS since we know the
13006 * sequence numbers haven't wrapped. This is a partial fix for the
13007 * "LAND" DoS attack.
13008 */
13009 if (SEQ_LT(th->th_seq, tp->irs)) {
13010 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
13011 ctf_do_dropwithreset(m, tp, th, tlen);
13012 return (1);
13013 }
13014 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13015 return (ret_val);
13016 }
13017 /*
13018 * If last ACK falls within this segment's sequence numbers, record
13019 * its timestamp. NOTE: 1) That the test incorporates suggestions
13020 * from the latest proposal of the tcplw@cray.com list (Braden
13021 * 1993/04/26). 2) That updating only on newer timestamps interferes
13022 * with our earlier PAWS tests, so this check should be solely
13023 * predicated on the sequence space of this segment. 3) That we
13024 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13025 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13026 * SEG.Len, This modified check allows us to overcome RFC1323's
13027 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13028 * p.869. In such cases, we can still calculate the RTT correctly
13029 * when RCV.NXT == Last.ACK.Sent.
13030 */
13031 if ((to->to_flags & TOF_TS) != 0 &&
13032 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13033 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13034 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13035 tp->ts_recent_age = tcp_ts_getticks();
13036 tp->ts_recent = to->to_tsval;
13037 }
13038 tp->snd_wnd = tiwin;
13039 rack_validate_fo_sendwin_up(tp, rack);
13040 /*
13041 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13042 * is on (half-synchronized state), then queue data for later
13043 * processing; else drop segment and return.
13044 */
13045 if ((thflags & TH_ACK) == 0) {
13046 if (tp->t_flags & TF_FASTOPEN) {
13047 rack_cc_conn_init(tp);
13048 }
13049 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13050 tiwin, thflags, nxt_pkt));
13051 }
13052 KMOD_TCPSTAT_INC(tcps_connects);
13053 if (tp->t_flags & TF_SONOTCONN) {
13054 tp->t_flags &= ~TF_SONOTCONN;
13055 soisconnected(so);
13056 }
13057 /* Do window scaling? */
13058 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
13059 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
13060 tp->rcv_scale = tp->request_r_scale;
13061 }
13062 /*
13063 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
13064 * FIN-WAIT-1
13065 */
13066 tp->t_starttime = ticks;
13067 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) {
13068 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
13069 tp->t_tfo_pending = NULL;
13070 }
13071 if (tp->t_flags & TF_NEEDFIN) {
13072 tcp_state_change(tp, TCPS_FIN_WAIT_1);
13073 tp->t_flags &= ~TF_NEEDFIN;
13074 } else {
13075 tcp_state_change(tp, TCPS_ESTABLISHED);
13076 TCP_PROBE5(accept__established, NULL, tp,
13077 mtod(m, const char *), tp, th);
13078 /*
13079 * TFO connections call cc_conn_init() during SYN
13080 * processing. Calling it again here for such connections
13081 * is not harmless as it would undo the snd_cwnd reduction
13082 * that occurs when a TFO SYN|ACK is retransmitted.
13083 */
13084 if (!(tp->t_flags & TF_FASTOPEN))
13085 rack_cc_conn_init(tp);
13086 }
13087 /*
13088 * Account for the ACK of our SYN prior to
13089 * regular ACK processing below, except for
13090 * simultaneous SYN, which is handled later.
13091 */
13092 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
13093 tp->snd_una++;
13094 /*
13095 * If segment contains data or ACK, will call tcp_reass() later; if
13096 * not, do so now to pass queued data to user.
13097 */
13098 if (tlen == 0 && (thflags & TH_FIN) == 0) {
13099 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
13100 (struct mbuf *)0);
13101 if (tp->t_flags & TF_WAKESOR) {
13102 tp->t_flags &= ~TF_WAKESOR;
13103 /* NB: sorwakeup_locked() does an implicit unlock. */
13104 sorwakeup_locked(so);
13105 }
13106 }
13107 tp->snd_wl1 = th->th_seq - 1;
13108 /* For syn-recv we need to possibly update the rtt */
13109 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
13110 uint32_t t, mcts;
13111
13112 mcts = tcp_ts_getticks();
13113 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
13114 if (!tp->t_rttlow || tp->t_rttlow > t)
13115 tp->t_rttlow = t;
13116 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
13117 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
13118 tcp_rack_xmit_timer_commit(rack, tp);
13119 }
13120 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13121 return (ret_val);
13122 }
13123 if (tp->t_state == TCPS_FIN_WAIT_1) {
13124 /* We could have went to FIN_WAIT_1 (or EST) above */
13125 /*
13126 * In FIN_WAIT_1 STATE in addition to the processing for the
13127 * ESTABLISHED state if our FIN is now acknowledged then
13128 * enter FIN_WAIT_2.
13129 */
13130 if (ourfinisacked) {
13131 /*
13132 * If we can't receive any more data, then closing
13133 * user can proceed. Starting the timer is contrary
13134 * to the specification, but if we don't get a FIN
13135 * we'll hang forever.
13136 *
13137 * XXXjl: we should release the tp also, and use a
13138 * compressed state.
13139 */
13140 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13141 soisdisconnected(so);
13142 tcp_timer_activate(tp, TT_2MSL,
13143 (tcp_fast_finwait2_recycle ?
13144 tcp_finwait2_timeout :
13145 TP_MAXIDLE(tp)));
13146 }
13147 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13148 }
13149 }
13150 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13151 tiwin, thflags, nxt_pkt));
13152 }
13153
13154 /*
13155 * Return value of 1, the TCB is unlocked and most
13156 * likely gone, return value of 0, the TCP is still
13157 * locked.
13158 */
13159 static int
rack_do_established(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t thflags,int32_t nxt_pkt,uint8_t iptos)13160 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
13161 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13162 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13163 {
13164 int32_t ret_val = 0;
13165 int32_t orig_tlen = tlen;
13166 struct tcp_rack *rack;
13167
13168 /*
13169 * Header prediction: check for the two common cases of a
13170 * uni-directional data xfer. If the packet has no control flags,
13171 * is in-sequence, the window didn't change and we're not
13172 * retransmitting, it's a candidate. If the length is zero and the
13173 * ack moved forward, we're the sender side of the xfer. Just free
13174 * the data acked & wake any higher level process that was blocked
13175 * waiting for space. If the length is non-zero and the ack didn't
13176 * move, we're the receiver side. If we're getting packets in-order
13177 * (the reassembly queue is empty), add the data toc The socket
13178 * buffer and note that we need a delayed ack. Make sure that the
13179 * hidden state-flags are also off. Since we check for
13180 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
13181 */
13182 rack = (struct tcp_rack *)tp->t_fb_ptr;
13183 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
13184 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
13185 __predict_true(SEGQ_EMPTY(tp)) &&
13186 __predict_true(th->th_seq == tp->rcv_nxt)) {
13187 if (tlen == 0) {
13188 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
13189 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
13190 return (0);
13191 }
13192 } else {
13193 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
13194 tiwin, nxt_pkt, iptos)) {
13195 return (0);
13196 }
13197 }
13198 }
13199 ctf_calc_rwin(so, tp);
13200
13201 if ((thflags & TH_RST) ||
13202 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13203 return (ctf_process_rst(m, th, so, tp));
13204
13205 /*
13206 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13207 * synchronized state.
13208 */
13209 if (thflags & TH_SYN) {
13210 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13211 return (ret_val);
13212 }
13213 /*
13214 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13215 * it's less than ts_recent, drop it.
13216 */
13217 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13218 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13219 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13220 return (ret_val);
13221 }
13222 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13223 return (ret_val);
13224 }
13225 /*
13226 * If last ACK falls within this segment's sequence numbers, record
13227 * its timestamp. NOTE: 1) That the test incorporates suggestions
13228 * from the latest proposal of the tcplw@cray.com list (Braden
13229 * 1993/04/26). 2) That updating only on newer timestamps interferes
13230 * with our earlier PAWS tests, so this check should be solely
13231 * predicated on the sequence space of this segment. 3) That we
13232 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13233 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13234 * SEG.Len, This modified check allows us to overcome RFC1323's
13235 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13236 * p.869. In such cases, we can still calculate the RTT correctly
13237 * when RCV.NXT == Last.ACK.Sent.
13238 */
13239 if ((to->to_flags & TOF_TS) != 0 &&
13240 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13241 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13242 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13243 tp->ts_recent_age = tcp_ts_getticks();
13244 tp->ts_recent = to->to_tsval;
13245 }
13246 /*
13247 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13248 * is on (half-synchronized state), then queue data for later
13249 * processing; else drop segment and return.
13250 */
13251 if ((thflags & TH_ACK) == 0) {
13252 if (tp->t_flags & TF_NEEDSYN) {
13253 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13254 tiwin, thflags, nxt_pkt));
13255
13256 } else if (tp->t_flags & TF_ACKNOW) {
13257 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13258 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13259 return (ret_val);
13260 } else {
13261 ctf_do_drop(m, NULL);
13262 return (0);
13263 }
13264 }
13265 /*
13266 * Ack processing.
13267 */
13268 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
13269 return (ret_val);
13270 }
13271 if (sbavail(&so->so_snd)) {
13272 if (ctf_progress_timeout_check(tp, true)) {
13273 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
13274 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13275 return (1);
13276 }
13277 }
13278 /* State changes only happen in rack_process_data() */
13279 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13280 tiwin, thflags, nxt_pkt));
13281 }
13282
/*
 * Segment handler for the CLOSE_WAIT state.
 *
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
	int32_t ret_val = 0;
	/* Remember the pre-trim length; ctf_drop_checks() may shrink tlen. */
	int32_t orig_tlen = tlen;

	ctf_calc_rwin(so, tp);
	if ((thflags & TH_RST) ||
	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
		return (ctf_process_rst(m, th, so, tp));
	/*
	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
	 * synchronized state.
	 */
	if (thflags & TH_SYN) {
		ctf_challenge_ack(m, th, tp, iptos, &ret_val);
		return (ret_val);
	}
	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
	 * it's less than ts_recent, drop it.
	 */
	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
			return (ret_val);
	}
	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
		return (ret_val);
	}
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * its timestamp. NOTE: 1) That the test incorporates suggestions
	 * from the latest proposal of the tcplw@cray.com list (Braden
	 * 1993/04/26). 2) That updating only on newer timestamps interferes
	 * with our earlier PAWS tests, so this check should be solely
	 * predicated on the sequence space of this segment. 3) That we
	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
	 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
	 * SEG.Len, This modified check allows us to overcome RFC1323's
	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
	 * p.869. In such cases, we can still calculate the RTT correctly
	 * when RCV.NXT == Last.ACK.Sent.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
	 * is on (half-synchronized state), then queue data for later
	 * processing; else drop segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_NEEDSYN) {
			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
			    tiwin, thflags, nxt_pkt));

		} else if (tp->t_flags & TF_ACKNOW) {
			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
			return (ret_val);
		} else {
			ctf_do_drop(m, NULL);
			return (0);
		}
	}
	/*
	 * Ack processing.
	 */
	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
		return (ret_val);
	}
	if (sbavail(&so->so_snd)) {
		/* Data is still queued; enforce the progress timeout. */
		if (ctf_progress_timeout_check(tp, true)) {
			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
			    tp, tick, PROGRESS_DROP, __LINE__);
			ctf_do_dropwithreset_conn(m, tp, th, tlen);
			return (1);
		}
	}
	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
	    tiwin, thflags, nxt_pkt));
}
13377
13378 static int
rack_check_data_after_close(struct mbuf * m,struct tcpcb * tp,int32_t * tlen,struct tcphdr * th,struct socket * so)13379 rack_check_data_after_close(struct mbuf *m,
13380 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
13381 {
13382 struct tcp_rack *rack;
13383
13384 rack = (struct tcp_rack *)tp->t_fb_ptr;
13385 if (rack->rc_allow_data_af_clo == 0) {
13386 close_now:
13387 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13388 /* tcp_close will kill the inp pre-log the Reset */
13389 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13390 tp = tcp_close(tp);
13391 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
13392 ctf_do_dropwithreset(m, tp, th, *tlen);
13393 return (1);
13394 }
13395 if (sbavail(&so->so_snd) == 0)
13396 goto close_now;
13397 /* Ok we allow data that is ignored and a followup reset */
13398 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13399 tp->rcv_nxt = th->th_seq + *tlen;
13400 tp->t_flags2 |= TF2_DROP_AF_DATA;
13401 rack->r_wanted_output = 1;
13402 *tlen = 0;
13403 return (0);
13404 }
13405
/*
 * Segment handler for the FIN_WAIT_1 state.
 *
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
	int32_t ret_val = 0;
	/* Remember the pre-trim length; ctf_drop_checks() may shrink tlen. */
	int32_t orig_tlen = tlen;
	int32_t ourfinisacked = 0;	/* set by rack_process_ack() if our FIN is acked */

	ctf_calc_rwin(so, tp);

	if ((thflags & TH_RST) ||
	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
		return (ctf_process_rst(m, th, so, tp));
	/*
	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
	 * synchronized state.
	 */
	if (thflags & TH_SYN) {
		ctf_challenge_ack(m, th, tp, iptos, &ret_val);
		return (ret_val);
	}
	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
	 * it's less than ts_recent, drop it.
	 */
	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
			return (ret_val);
	}
	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
		return (ret_val);
	}
	/*
	 * If new data are received on a connection after the user processes
	 * are gone, then RST the other end.
	 */
	if ((tp->t_flags & TF_CLOSED) && tlen &&
	    rack_check_data_after_close(m, tp, &tlen, th, so))
		return (1);
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * its timestamp. NOTE: 1) That the test incorporates suggestions
	 * from the latest proposal of the tcplw@cray.com list (Braden
	 * 1993/04/26). 2) That updating only on newer timestamps interferes
	 * with our earlier PAWS tests, so this check should be solely
	 * predicated on the sequence space of this segment. 3) That we
	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
	 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
	 * SEG.Len, This modified check allows us to overcome RFC1323's
	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
	 * p.869. In such cases, we can still calculate the RTT correctly
	 * when RCV.NXT == Last.ACK.Sent.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
	 * is on (half-synchronized state), then queue data for later
	 * processing; else drop segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_NEEDSYN) {
			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
			    tiwin, thflags, nxt_pkt));
		} else if (tp->t_flags & TF_ACKNOW) {
			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
			return (ret_val);
		} else {
			ctf_do_drop(m, NULL);
			return (0);
		}
	}
	/*
	 * Ack processing.
	 */
	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
		return (ret_val);
	}
	if (ourfinisacked) {
		/*
		 * If we can't receive any more data, then closing user can
		 * proceed. Starting the timer is contrary to the
		 * specification, but if we don't get a FIN we'll hang
		 * forever.
		 *
		 * XXXjl: we should release the tp also, and use a
		 * compressed state.
		 */
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			soisdisconnected(so);
			tcp_timer_activate(tp, TT_2MSL,
			    (tcp_fast_finwait2_recycle ?
			    tcp_finwait2_timeout :
			    TP_MAXIDLE(tp)));
		}
		/* Our FIN is acked: advance FIN_WAIT_1 -> FIN_WAIT_2. */
		tcp_state_change(tp, TCPS_FIN_WAIT_2);
	}
	if (sbavail(&so->so_snd)) {
		/* Data is still queued; enforce the progress timeout. */
		if (ctf_progress_timeout_check(tp, true)) {
			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
			    tp, tick, PROGRESS_DROP, __LINE__);
			ctf_do_dropwithreset_conn(m, tp, th, tlen);
			return (1);
		}
	}
	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
	    tiwin, thflags, nxt_pkt));
}
13527
/*
 * Segment handler for the CLOSING state.
 *
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
	int32_t ret_val = 0;
	/* Remember the pre-trim length; ctf_drop_checks() may shrink tlen. */
	int32_t orig_tlen = tlen;
	int32_t ourfinisacked = 0;	/* set by rack_process_ack() if our FIN is acked */

	ctf_calc_rwin(so, tp);

	if ((thflags & TH_RST) ||
	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
		return (ctf_process_rst(m, th, so, tp));
	/*
	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
	 * synchronized state.
	 */
	if (thflags & TH_SYN) {
		ctf_challenge_ack(m, th, tp, iptos, &ret_val);
		return (ret_val);
	}
	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
	 * it's less than ts_recent, drop it.
	 */
	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
			return (ret_val);
	}
	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
		return (ret_val);
	}
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * its timestamp. NOTE: 1) That the test incorporates suggestions
	 * from the latest proposal of the tcplw@cray.com list (Braden
	 * 1993/04/26). 2) That updating only on newer timestamps interferes
	 * with our earlier PAWS tests, so this check should be solely
	 * predicated on the sequence space of this segment. 3) That we
	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
	 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
	 * SEG.Len, This modified check allows us to overcome RFC1323's
	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
	 * p.869. In such cases, we can still calculate the RTT correctly
	 * when RCV.NXT == Last.ACK.Sent.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
	 * is on (half-synchronized state), then queue data for later
	 * processing; else drop segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_NEEDSYN) {
			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
			    tiwin, thflags, nxt_pkt));
		} else if (tp->t_flags & TF_ACKNOW) {
			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
			return (ret_val);
		} else {
			ctf_do_drop(m, NULL);
			return (0);
		}
	}
	/*
	 * Ack processing.
	 */
	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
		return (ret_val);
	}
	if (ourfinisacked) {
		/* Both sides' FINs are accounted for: enter TIME_WAIT. */
		tcp_twstart(tp);
		m_freem(m);
		return (1);
	}
	if (sbavail(&so->so_snd)) {
		/* Data is still queued; enforce the progress timeout. */
		if (ctf_progress_timeout_check(tp, true)) {
			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
			    tp, tick, PROGRESS_DROP, __LINE__);
			ctf_do_dropwithreset_conn(m, tp, th, tlen);
			return (1);
		}
	}
	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
	    tiwin, thflags, nxt_pkt));
}
13628
13629 /*
13630 * Return value of 1, the TCB is unlocked and most
13631 * likely gone, return value of 0, the TCP is still
13632 * locked.
13633 */
13634 static int
rack_do_lastack(struct mbuf * m,struct tcphdr * th,struct socket * so,struct tcpcb * tp,struct tcpopt * to,int32_t drop_hdrlen,int32_t tlen,uint32_t tiwin,int32_t thflags,int32_t nxt_pkt,uint8_t iptos)13635 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
13636 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13637 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13638 {
13639 int32_t ret_val = 0;
13640 int32_t orig_tlen;
13641 int32_t ourfinisacked = 0;
13642
13643 ctf_calc_rwin(so, tp);
13644
13645 if ((thflags & TH_RST) ||
13646 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13647 return (ctf_process_rst(m, th, so, tp));
13648 /*
13649 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13650 * synchronized state.
13651 */
13652 if (thflags & TH_SYN) {
13653 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13654 return (ret_val);
13655 }
13656 /*
13657 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13658 * it's less than ts_recent, drop it.
13659 */
13660 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13661 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13662 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13663 return (ret_val);
13664 }
13665 orig_tlen = tlen;
13666 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13667 return (ret_val);
13668 }
13669 /*
13670 * If last ACK falls within this segment's sequence numbers, record
13671 * its timestamp. NOTE: 1) That the test incorporates suggestions
13672 * from the latest proposal of the tcplw@cray.com list (Braden
13673 * 1993/04/26). 2) That updating only on newer timestamps interferes
13674 * with our earlier PAWS tests, so this check should be solely
13675 * predicated on the sequence space of this segment. 3) That we
13676 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13677 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13678 * SEG.Len, This modified check allows us to overcome RFC1323's
13679 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13680 * p.869. In such cases, we can still calculate the RTT correctly
13681 * when RCV.NXT == Last.ACK.Sent.
13682 */
13683 if ((to->to_flags & TOF_TS) != 0 &&
13684 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13685 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13686 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13687 tp->ts_recent_age = tcp_ts_getticks();
13688 tp->ts_recent = to->to_tsval;
13689 }
13690 /*
13691 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13692 * is on (half-synchronized state), then queue data for later
13693 * processing; else drop segment and return.
13694 */
13695 if ((thflags & TH_ACK) == 0) {
13696 if (tp->t_flags & TF_NEEDSYN) {
13697 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13698 tiwin, thflags, nxt_pkt));
13699 } else if (tp->t_flags & TF_ACKNOW) {
13700 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13701 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13702 return (ret_val);
13703 } else {
13704 ctf_do_drop(m, NULL);
13705 return (0);
13706 }
13707 }
13708 /*
13709 * case TCPS_LAST_ACK: Ack processing.
13710 */
13711 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13712 return (ret_val);
13713 }
13714 if (ourfinisacked) {
13715 tp = tcp_close(tp);
13716 ctf_do_drop(m, tp);
13717 return (1);
13718 }
13719 if (sbavail(&so->so_snd)) {
13720 if (ctf_progress_timeout_check(tp, true)) {
13721 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13722 tp, tick, PROGRESS_DROP, __LINE__);
13723 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13724 return (1);
13725 }
13726 }
13727 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13728 tiwin, thflags, nxt_pkt));
13729 }
13730
/*
 * Segment handler for the FIN_WAIT_2 state.
 *
 * Return value of 1, the TCB is unlocked and most
 * likely gone, return value of 0, the TCP is still
 * locked.
 */
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
{
	int32_t ret_val = 0;
	/* Remember the pre-trim length; ctf_drop_checks() may shrink tlen. */
	int32_t orig_tlen = tlen;
	int32_t ourfinisacked = 0;	/* filled in by rack_process_ack() */

	ctf_calc_rwin(so, tp);

	/* Reset receive buffer auto scaling when not in bulk receive mode. */
	if ((thflags & TH_RST) ||
	    (tp->t_fin_is_rst && (thflags & TH_FIN)))
		return (ctf_process_rst(m, th, so, tp));
	/*
	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
	 * synchronized state.
	 */
	if (thflags & TH_SYN) {
		ctf_challenge_ack(m, th, tp, iptos, &ret_val);
		return (ret_val);
	}
	/*
	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
	 * it's less than ts_recent, drop it.
	 */
	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
		if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
			return (ret_val);
	}
	if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
		return (ret_val);
	}
	/*
	 * If new data are received on a connection after the user processes
	 * are gone, then RST the other end.
	 */
	if ((tp->t_flags & TF_CLOSED) && tlen &&
	    rack_check_data_after_close(m, tp, &tlen, th, so))
		return (1);
	/*
	 * If last ACK falls within this segment's sequence numbers, record
	 * its timestamp. NOTE: 1) That the test incorporates suggestions
	 * from the latest proposal of the tcplw@cray.com list (Braden
	 * 1993/04/26). 2) That updating only on newer timestamps interferes
	 * with our earlier PAWS tests, so this check should be solely
	 * predicated on the sequence space of this segment. 3) That we
	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
	 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
	 * SEG.Len, This modified check allows us to overcome RFC1323's
	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
	 * p.869. In such cases, we can still calculate the RTT correctly
	 * when RCV.NXT == Last.ACK.Sent.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
	 * is on (half-synchronized state), then queue data for later
	 * processing; else drop segment and return.
	 */
	if ((thflags & TH_ACK) == 0) {
		if (tp->t_flags & TF_NEEDSYN) {
			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
			    tiwin, thflags, nxt_pkt));
		} else if (tp->t_flags & TF_ACKNOW) {
			ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
			return (ret_val);
		} else {
			ctf_do_drop(m, NULL);
			return (0);
		}
	}
	/*
	 * Ack processing.
	 */
	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
		return (ret_val);
	}
	if (sbavail(&so->so_snd)) {
		/* Data is still queued; enforce the progress timeout. */
		if (ctf_progress_timeout_check(tp, true)) {
			rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
			    tp, tick, PROGRESS_DROP, __LINE__);
			ctf_do_dropwithreset_conn(m, tp, th, tlen);
			return (1);
		}
	}
	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
	    tiwin, thflags, nxt_pkt));
}
13834
13835 static void inline
rack_clear_rate_sample(struct tcp_rack * rack)13836 rack_clear_rate_sample(struct tcp_rack *rack)
13837 {
13838 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
13839 rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
13840 rack->r_ctl.rack_rs.rs_rtt_tot = 0;
13841 }
13842
/*
 * Recompute the pacing segment bounds (rc_pace_min_segs /
 * rc_pace_max_segs) for the connection.  The minimum is always the
 * fixed maxseg; the maximum depends on which pacing mode is active
 * (forced max-seg, fixed rate, or always-pace with an estimated or
 * initial rate).  Any change is logged via
 * rack_log_type_pacing_sizes().
 *
 * fill_override, when non-NULL, supplies the rate to use instead of
 * the goodput estimate from rack_get_gp_est().
 */
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
{
	uint64_t bw_est, rate_wanted;
	int chged = 0;
	uint32_t user_max, orig_min, orig_max;

#ifdef TCP_REQUEST_TRK
	if (rack->rc_hybrid_mode &&
	    (rack->r_ctl.rc_pace_max_segs != 0) &&
	    (rack_hybrid_allow_set_maxseg == 1) &&
	    (rack->r_ctl.rc_last_sft != NULL)) {
		/*
		 * Hybrid pacing with an already-set max: leave the sizes
		 * alone, just clear the per-request set-mss flag.
		 */
		rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS;
		return;
	}
#endif
	/* Keep the previous values so we only log on an actual change. */
	orig_min = rack->r_ctl.rc_pace_min_segs;
	orig_max = rack->r_ctl.rc_pace_max_segs;
	user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
	if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
		chged = 1;
	rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
	if (rack->use_fixed_rate || rack->rc_force_max_seg) {
		if (user_max != rack->r_ctl.rc_pace_max_segs)
			chged = 1;
	}
	if (rack->rc_force_max_seg) {
		/* User forced a max segment count. */
		rack->r_ctl.rc_pace_max_segs = user_max;
	} else if (rack->use_fixed_rate) {
		bw_est = rack_get_bw(rack);
		if ((rack->r_ctl.crte == NULL) ||
		    (bw_est != rack->r_ctl.crte->rate)) {
			/* No hardware rate, or it disagrees: use user max. */
			rack->r_ctl.rc_pace_max_segs = user_max;
		} else {
			/* We are pacing right at the hardware rate */
			uint32_t segsiz, pace_one;

			if (rack_pace_one_seg ||
			    (rack->r_ctl.rc_user_set_min_segs == 1))
				pace_one = 1;
			else
				pace_one = 0;
			segsiz = min(ctf_fixed_maxseg(tp),
			    rack->r_ctl.rc_pace_min_segs);
			rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(
				tp, bw_est, segsiz, pace_one,
				rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
		}
	} else if (rack->rc_always_pace) {
		if (rack->r_ctl.gp_bw ||
		    rack->r_ctl.init_rate) {
			/* We have a rate of some sort set */
			uint32_t orig;

			bw_est = rack_get_bw(rack);
			orig = rack->r_ctl.rc_pace_max_segs;
			if (fill_override)
				rate_wanted = *fill_override;
			else
				rate_wanted = rack_get_gp_est(rack);
			if (rate_wanted) {
				/* We have something */
				rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
				    rate_wanted,
				    ctf_fixed_maxseg(rack->rc_tp));
			} else
				rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
			if (orig != rack->r_ctl.rc_pace_max_segs)
				chged = 1;
		} else if ((rack->r_ctl.gp_bw == 0) &&
			   (rack->r_ctl.rc_pace_max_segs == 0)) {
			/*
			 * If we have nothing limit us to bursting
			 * out IW sized pieces.
			 */
			chged = 1;
			rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
		}
	}
	/* Never exceed the absolute per-send IP byte cap. */
	if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
		chged = 1;
		rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
	}
	if (chged)
		rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
}
13929
13930
/*
 * Fill in the pre-built fast-send block (fsb) header template for this
 * connection: the IPv6 or IPv4 header, an optional UDP encapsulation
 * header when tp->t_port is set, and the TCP header, plus the cached
 * hop limit / TTL and receive window.  Marks r_fsb_inited when done.
 * The buffer itself must already have been allocated by
 * rack_init_fsb().
 */
static void
rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags)
{
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
#ifdef INET
	struct ip *ip = NULL;
#endif
	struct udphdr *udp = NULL;

	/* Ok lets fill in the fast block, it can only be used with no IP options! */
#ifdef INET6
	if (rack->r_is_v6) {
		rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
		if (tp->t_port) {
			/* UDP tunneling: insert a UDP header between IP and TCP. */
			rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
			udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
			udp->uh_dport = tp->t_port;
			rack->r_ctl.fsb.udp = udp;
			rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
		} else
		{
			rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
			rack->r_ctl.fsb.udp = NULL;
		}
		tcpip_fillheaders(rack->rc_inp,
				  tp->t_port,
				  ip6, rack->r_ctl.fsb.th);
		rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL);
	} else
#endif				/* INET6 */
#ifdef INET
	{
		rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
		if (tp->t_port) {
			/* UDP tunneling: insert a UDP header between IP and TCP. */
			rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
			udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
			udp->uh_dport = tp->t_port;
			rack->r_ctl.fsb.udp = udp;
			rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
		} else
		{
			rack->r_ctl.fsb.udp = NULL;
			rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
		}
		tcpip_fillheaders(rack->rc_inp,
				  tp->t_port,
				  ip, rack->r_ctl.fsb.th);
		rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl;
	}
#endif
	/* Cache the current receive window, clamped to the scaled maximum. */
	rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0),
	    (long)TCP_MAXWIN << tp->rcv_scale);
	rack->r_fsb_inited = 1;
}
13991
13992 static int
rack_init_fsb(struct tcpcb * tp,struct tcp_rack * rack)13993 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
13994 {
13995 /*
13996 * Allocate the larger of spaces V6 if available else just
13997 * V4 and include udphdr (overbook)
13998 */
13999 #ifdef INET6
14000 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
14001 #else
14002 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
14003 #endif
14004 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
14005 M_TCPFSB, M_NOWAIT|M_ZERO);
14006 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
14007 return (ENOMEM);
14008 }
14009 rack->r_fsb_inited = 0;
14010 return (0);
14011 }
14012
14013 static void
rack_log_hystart_event(struct tcp_rack * rack,uint32_t high_seq,uint8_t mod)14014 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod)
14015 {
14016 /*
14017 * Types of logs (mod value)
14018 * 20 - Initial round setup
14019 * 21 - Rack declares a new round.
14020 */
14021 struct tcpcb *tp;
14022
14023 tp = rack->rc_tp;
14024 if (tcp_bblogging_on(tp)) {
14025 union tcp_log_stackspecific log;
14026 struct timeval tv;
14027
14028 memset(&log, 0, sizeof(log));
14029 log.u_bbr.flex1 = rack->r_ctl.current_round;
14030 log.u_bbr.flex2 = rack->r_ctl.roundends;
14031 log.u_bbr.flex3 = high_seq;
14032 log.u_bbr.flex4 = tp->snd_max;
14033 log.u_bbr.flex8 = mod;
14034 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14035 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
14036 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
14037 TCP_LOG_EVENTP(tp, NULL,
14038 &tptosocket(tp)->so_rcv,
14039 &tptosocket(tp)->so_snd,
14040 TCP_HYSTART, 0,
14041 0, &log, false, &tv);
14042 }
14043 }
14044
14045 static void
rack_deferred_init(struct tcpcb * tp,struct tcp_rack * rack)14046 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack)
14047 {
14048 rack->rack_deferred_inited = 1;
14049 rack->r_ctl.roundends = tp->snd_max;
14050 rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
14051 rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
14052 }
14053
14054 static void
rack_init_retransmit_value(struct tcp_rack * rack,int ctl)14055 rack_init_retransmit_value(struct tcp_rack *rack, int ctl)
14056 {
14057 /* Retransmit bit controls.
14058 *
14059 * The setting of these values control one of
14060 * three settings you can have and dictate
14061 * how rack does retransmissions. Note this
14062 * is in *any* mode i.e. pacing on or off DGP
14063 * fixed rate pacing, or just bursting rack.
14064 *
14065 * 1 - Use full sized retransmits i.e. limit
14066 * the size to whatever the pace_max_segments
14067 * size is.
14068 *
14069 * 2 - Use pacer min granularity as a guide to
14070 * the size combined with the current calculated
14071 * goodput b/w measurement. So for example if
14072 * the goodput is measured at 20Mbps we would
14073 * calculate 8125 (pacer minimum 250usec in
14074 * that b/w) and then round it up to the next
14075 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes.
14076 *
14077 * 0 - The rack default 1 MSS (anything not 0/1/2
14078 * fall here too if we are setting via rack_init()).
14079 *
14080 */
14081 if (ctl == 1) {
14082 rack->full_size_rxt = 1;
14083 rack->shape_rxt_to_pacing_min = 0;
14084 } else if (ctl == 2) {
14085 rack->full_size_rxt = 0;
14086 rack->shape_rxt_to_pacing_min = 1;
14087 } else {
14088 rack->full_size_rxt = 0;
14089 rack->shape_rxt_to_pacing_min = 0;
14090 }
14091 }
14092
14093 static void
rack_log_chg_info(struct tcpcb * tp,struct tcp_rack * rack,uint8_t mod,uint32_t flex1,uint32_t flex2,uint32_t flex3)14094 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod,
14095 uint32_t flex1,
14096 uint32_t flex2,
14097 uint32_t flex3)
14098 {
14099 if (tcp_bblogging_on(rack->rc_tp)) {
14100 union tcp_log_stackspecific log;
14101 struct timeval tv;
14102
14103 memset(&log, 0, sizeof(log));
14104 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14105 log.u_bbr.flex8 = mod;
14106 log.u_bbr.flex1 = flex1;
14107 log.u_bbr.flex2 = flex2;
14108 log.u_bbr.flex3 = flex3;
14109 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0,
14110 0, &log, false, NULL, __func__, __LINE__, &tv);
14111 }
14112 }
14113
/*
 * tfb_chg_query entry point: answer a TCP_QUERY_* request from a
 * stack that is taking over this connection.  Returns 1 when reqr was
 * filled in, 0 when there is nothing to report for the request, and a
 * negative errno for an unknown request type.
 */
static int
rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr)
{
	struct tcp_rack *rack;
	struct rack_sendmap *rsm;
	int i;


	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (reqr->req) {
	case TCP_QUERY_SENDMAP:
		/* Report the sendmap entry that contains sequence req_param. */
		if ((reqr->req_param == tp->snd_max) ||
		    (tp->snd_max == tp->snd_una)){
			/* Unlikely */
			return (0);
		}
		rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param);
		if (rsm == NULL) {
			/* Can't find that seq -- unlikely */
			return (0);
		}
		reqr->sendmap_start = rsm->r_start;
		reqr->sendmap_end = rsm->r_end;
		reqr->sendmap_send_cnt = rsm->r_rtr_cnt;
		reqr->sendmap_fas = rsm->r_fas;
		/* Clamp so the fixed-size sendmap_time[] array is not overrun. */
		if (reqr->sendmap_send_cnt > SNDMAP_NRTX)
			reqr->sendmap_send_cnt = SNDMAP_NRTX;
		for(i=0; i<reqr->sendmap_send_cnt; i++)
			reqr->sendmap_time[i] = rsm->r_tim_lastsent[i];
		reqr->sendmap_ack_arrival = rsm->r_ack_arrival;
		reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK;
		reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes;
		reqr->sendmap_dupacks = rsm->r_dupack;
		rack_log_chg_info(tp, rack, 1,
		    rsm->r_start,
		    rsm->r_end,
		    rsm->r_flags);
		return(1);
		break;
	case TCP_QUERY_TIMERS_UP:
		/* Report any pending pacer deadline and/or rack timer. */
		if (rack->r_ctl.rc_hpts_flags == 0) {
			/* no timers up */
			return (0);
		}
		reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags;
		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
			reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to;
		}
		if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
			reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp;
		}
		rack_log_chg_info(tp, rack, 2,
		    rack->r_ctl.rc_hpts_flags,
		    rack->r_ctl.rc_last_output_to,
		    rack->r_ctl.rc_timer_exp);
		return (1);
		break;
	case TCP_QUERY_RACK_TIMES:
		/* Export rack's misc timestamps, RTT, PRR and persist state. */
		/* Reordering items */
		reqr->rack_num_dsacks = rack->r_ctl.num_dsack;
		reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts;
		/* Timerstamps and timers */
		reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time;
		reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt;
		reqr->rack_rtt = rack->rc_rack_rtt;
		reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time;
		reqr->rack_srtt_measured = rack->rc_srtt_measure_made;
		/* PRR data */
		reqr->rack_sacked = rack->r_ctl.rc_sacked;
		reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt;
		reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered;
		reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs;
		reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt;
		reqr->rack_prr_out = rack->r_ctl.rc_prr_out;
		/* TLP and persists info */
		reqr->rack_tlp_out = rack->rc_tlp_in_progress;
		reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out;
		if (rack->rc_in_persist) {
			reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time;
			reqr->rack_in_persist = 1;
		} else {
			reqr->rack_time_went_idle = 0;
			reqr->rack_in_persist = 0;
		}
		if (rack->r_wanted_output)
			reqr->rack_wanted_output = 1;
		else
			reqr->rack_wanted_output = 0;
		return (1);
		break;
	default:
		/*
		 * NOTE(review): negative errno return; callers appear to
		 * test only for == 1 / == 0 (see rack_init_outstanding),
		 * so the sign is presumably inconsequential -- confirm.
		 */
		return (-EINVAL);
	}
}
14208
static void
rack_switch_failed(struct tcpcb *tp)
{
	/*
	 * This method gets called if a stack switch was
	 * attempted and it failed. We are left
	 * but our hpts timers were stopped and we
	 * need to validate time units and t_flags2.
	 */
	struct tcp_rack *rack;
	struct timeval tv;
	uint32_t cts;
	uint32_t toval;
	struct hpts_diag diag;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	/* Rack keeps its timer values in microseconds. */
	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
	/* Re-derive the t_flags2 bits that depend on rack's mode. */
	if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
		tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
	else
		tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
	if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
		tp->t_flags2 |= TF2_MBUF_ACKCMP;
	if (tp->t_in_hpts > IHPTS_NONE) {
		/* Strange -- already on hpts despite the failed switch; leave it. */
		return;
	}
	cts = tcp_get_usecs(&tv);
	/*
	 * Re-arm hpts for whichever deadline was pending: the pacer
	 * output time first, else any rack timer expiry, else a single
	 * slot so we get back into hpts promptly.
	 */
	if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
		if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
			toval = rack->r_ctl.rc_last_output_to - cts;
		} else {
			/* one slot please */
			toval = HPTS_USECS_PER_SLOT;
		}
	} else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
		if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
			toval = rack->r_ctl.rc_timer_exp - cts;
		} else {
			/* one slot please */
			toval = HPTS_USECS_PER_SLOT;
		}
	} else
		toval = HPTS_USECS_PER_SLOT;
	tcp_hpts_insert(tp, toval, &diag);
	rack_log_hpts_diag(rack, cts, &diag, &tv);
}
14256
14257 static int
rack_init_outstanding(struct tcpcb * tp,struct tcp_rack * rack,uint32_t us_cts,void * ptr)14258 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr)
14259 {
14260 struct rack_sendmap *rsm, *ersm;
14261 int insret __diagused;
14262 /*
14263 * When initing outstanding, we must be quite careful
14264 * to not refer to tp->t_fb_ptr. This has the old rack
14265 * pointer in it, not the "new" one (when we are doing
14266 * a stack switch).
14267 */
14268
14269
14270 if (tp->t_fb->tfb_chg_query == NULL) {
14271 /* Create a send map for the current outstanding data */
14272
14273 rsm = rack_alloc(rack);
14274 if (rsm == NULL) {
14275 uma_zfree(rack_pcb_zone, ptr);
14276 return (ENOMEM);
14277 }
14278 rsm->r_no_rtt_allowed = 1;
14279 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
14280 rsm->r_rtr_cnt = 1;
14281 rsm->r_rtr_bytes = 0;
14282 if (tp->t_flags & TF_SENTFIN)
14283 rsm->r_flags |= RACK_HAS_FIN;
14284 rsm->r_end = tp->snd_max;
14285 if (tp->snd_una == tp->iss) {
14286 /* The data space is one beyond snd_una */
14287 rsm->r_flags |= RACK_HAS_SYN;
14288 rsm->r_start = tp->iss;
14289 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
14290 } else
14291 rsm->r_start = tp->snd_una;
14292 rsm->r_dupack = 0;
14293 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
14294 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
14295 if (rsm->m) {
14296 rsm->orig_m_len = rsm->m->m_len;
14297 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14298 } else {
14299 rsm->orig_m_len = 0;
14300 rsm->orig_t_space = 0;
14301 }
14302 } else {
14303 /*
14304 * This can happen if we have a stand-alone FIN or
14305 * SYN.
14306 */
14307 rsm->m = NULL;
14308 rsm->orig_m_len = 0;
14309 rsm->orig_t_space = 0;
14310 rsm->soff = 0;
14311 }
14312 #ifdef INVARIANTS
14313 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14314 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
14315 insret, rack, rsm);
14316 }
14317 #else
14318 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14319 #endif
14320 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14321 rsm->r_in_tmap = 1;
14322 } else {
14323 /* We have a query mechanism, lets use it */
14324 struct tcp_query_resp qr;
14325 int i;
14326 tcp_seq at;
14327
14328 at = tp->snd_una;
14329 while (at != tp->snd_max) {
14330 memset(&qr, 0, sizeof(qr));
14331 qr.req = TCP_QUERY_SENDMAP;
14332 qr.req_param = at;
14333 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
14334 break;
14335 /* Move forward */
14336 at = qr.sendmap_end;
14337 /* Now lets build the entry for this one */
14338 rsm = rack_alloc(rack);
14339 if (rsm == NULL) {
14340 uma_zfree(rack_pcb_zone, ptr);
14341 return (ENOMEM);
14342 }
14343 memset(rsm, 0, sizeof(struct rack_sendmap));
14344 /* Now configure the rsm and insert it */
14345 rsm->r_dupack = qr.sendmap_dupacks;
14346 rsm->r_start = qr.sendmap_start;
14347 rsm->r_end = qr.sendmap_end;
14348 if (qr.sendmap_fas)
14349 rsm->r_fas = qr.sendmap_end;
14350 else
14351 rsm->r_fas = rsm->r_start - tp->snd_una;
14352 /*
14353 * We have carefully aligned the bits
14354 * so that all we have to do is copy over
14355 * the bits with the mask.
14356 */
14357 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK;
14358 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes;
14359 rsm->r_rtr_cnt = qr.sendmap_send_cnt;
14360 rsm->r_ack_arrival = qr.sendmap_ack_arrival;
14361 for (i=0 ; i<rsm->r_rtr_cnt; i++)
14362 rsm->r_tim_lastsent[i] = qr.sendmap_time[i];
14363 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
14364 (rsm->r_start - tp->snd_una), &rsm->soff);
14365 if (rsm->m) {
14366 rsm->orig_m_len = rsm->m->m_len;
14367 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14368 } else {
14369 rsm->orig_m_len = 0;
14370 rsm->orig_t_space = 0;
14371 }
14372 #ifdef INVARIANTS
14373 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14374 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
14375 insret, rack, rsm);
14376 }
14377 #else
14378 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14379 #endif
14380 if ((rsm->r_flags & RACK_ACKED) == 0) {
14381 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) {
14382 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] >
14383 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) {
14384 /*
14385 * If the existing ersm was sent at
14386 * a later time than the new one, then
14387 * the new one should appear ahead of this
14388 * ersm.
14389 */
14390 rsm->r_in_tmap = 1;
14391 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext);
14392 break;
14393 }
14394 }
14395 if (rsm->r_in_tmap == 0) {
14396 /*
14397 * Not found so shove it on the tail.
14398 */
14399 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14400 rsm->r_in_tmap = 1;
14401 }
14402 } else {
14403 if ((rack->r_ctl.rc_sacklast == NULL) ||
14404 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) {
14405 rack->r_ctl.rc_sacklast = rsm;
14406 }
14407 }
14408 rack_log_chg_info(tp, rack, 3,
14409 rsm->r_start,
14410 rsm->r_end,
14411 rsm->r_flags);
14412 }
14413 }
14414 return (0);
14415 }
14416
14417
/*
 * Initialize rack state for a connection (tfb_tcb_init entry point).
 * Called either from tcp_newtcppcb() -- in which case ptr is
 * &tp->t_fb_ptr -- or during a stack switch, in which case ptr is a
 * local in the caller and the previous stack can be queried (via
 * tfb_chg_query) for its sendmap, timers, and misc state.  Returns 0
 * on success or ENOMEM on allocation failure.
 */
static int32_t
rack_init(struct tcpcb *tp, void **ptr)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct tcp_rack *rack = NULL;
	uint32_t iwin, snt, us_cts;
	size_t sz;
	int err, no_query;

	tcp_hpts_init(tp);

	/*
	 * First are we the initial or are we a switched stack?
	 * If we are initing via tcp_newtcppcb the ptr passed
	 * will be tp->t_fb_ptr. If its a stack switch that
	 * has a previous stack we can query it will be a local
	 * var that will in the end be set into t_fb_ptr.
	 */
	if (ptr == &tp->t_fb_ptr)
		no_query = 1;
	else
		no_query = 0;
	*ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
	if (*ptr == NULL) {
		/*
		 * We need to allocate memory but cant. The INP and INP_INFO
		 * locks and they are recursive (happens during setup. So a
		 * scheme to drop the locks fails :(
		 *
		 */
		return(ENOMEM);
	}
	memset(*ptr, 0, sizeof(struct tcp_rack));
	rack = (struct tcp_rack *)*ptr;
	rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT);
	if (rack->r_ctl.tqh == NULL) {
		uma_zfree(rack_pcb_zone, rack);
		return(ENOMEM);
	}
	tqhash_init(rack->r_ctl.tqh);
	TAILQ_INIT(&rack->r_ctl.rc_free);
	TAILQ_INIT(&rack->r_ctl.rc_tmap);
	rack->rc_tp = tp;
	rack->rc_inp = inp;
	/* Set the flag */
	rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0;
	/* Probably not needed but lets be sure */
	rack_clear_rate_sample(rack);
	/*
	 * Save off the default values, socket options will poke
	 * at these if pacing is not on or we have not yet
	 * reached where pacing is on (gp_ready/fixed enabled).
	 * When they get set into the CC module (when gp_ready
	 * is enabled or we enable fixed) then we will set these
	 * values into the CC and place in here the old values
	 * so we have a restoral. Then we will set the flag
	 * rc_pacing_cc_set. That way whenever we turn off pacing
	 * or switch off this stack, we will know to go restore
	 * the saved values.
	 *
	 * We specifically put into the beta the ecn value for pacing.
	 */
	rack->rc_new_rnd_needed = 1;
	rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
	/* We want abe like behavior as well */

	rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
	rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
	rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
	if (rack_fill_cw_state)
		rack->rc_pace_to_cwnd = 1;
	if (rack_pacing_min_seg)
		rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg;
	if (use_rack_rr)
		rack->use_rack_rr = 1;
	if (rack_dnd_default) {
		rack->rc_pace_dnd = 1;
	}
	if (V_tcp_delack_enabled)
		tp->t_delayed_ack = 1;
	else
		tp->t_delayed_ack = 0;
#ifdef TCP_ACCOUNTING
	if (rack_tcp_accounting) {
		tp->t_flags2 |= TF2_TCP_ACCOUNTING;
	}
#endif
	/* Allocate the PCM sample array; failure just disables PCM (cnt_alloc=0). */
	rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
	sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
	rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT);
	if (rack->r_ctl.pcm_s == NULL) {
		rack->r_ctl.pcm_i.cnt_alloc = 0;
	}
	rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
	rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
	if (rack_enable_shared_cwnd)
		rack->rack_enable_scwnd = 1;
	rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
	rack->rc_user_set_max_segs = rack_hptsi_segments;
	rack->r_ctl.max_reduction = rack_max_reduce;
	rack->rc_force_max_seg = 0;
	TAILQ_INIT(&rack->r_ctl.opt_list);
	rack->r_ctl.rc_saved_beta = V_newreno_beta_ecn;
	rack->r_ctl.rc_saved_beta_ecn = V_newreno_beta_ecn;
	if (rack_hibeta_setting) {
		rack->rack_hibeta = 1;
		/* Only accept a hibeta in the sane 50..100 percent range. */
		if ((rack_hibeta_setting >= 50) &&
		    (rack_hibeta_setting <= 100)) {
			rack->r_ctl.rc_saved_beta = rack_hibeta_setting;
			rack->r_ctl.saved_hibeta = rack_hibeta_setting;
		}
	} else {
		rack->r_ctl.saved_hibeta = 50;
	}
	/*
	 * We initialize to all ones so we never match 0
	 * just in case the client sends in 0, it hopefully
	 * will never have all 1's in ms :-)
	 */
	rack->r_ctl.last_tm_mark = 0xffffffffffffffff;
	rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
	rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
	rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
	rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
	rack->r_ctl.rc_highest_us_rtt = 0;
	rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
	rack->pcm_enabled = rack_pcm_is_enabled;
	if (rack_fillcw_bw_cap)
		rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
	rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
	if (rack_use_cmp_acks)
		rack->r_use_cmp_ack = 1;
	if (rack_disable_prr)
		rack->rack_no_prr = 1;
	if (rack_gp_no_rec_chg)
		rack->rc_gp_no_rec_chg = 1;
	if (rack_pace_every_seg && tcp_can_enable_pacing()) {
		rack->r_ctl.pacing_method |= RACK_REG_PACING;
		rack->rc_always_pace = 1;
		if (rack->rack_hibeta)
			rack_set_cc_pacing(rack);
	} else
		rack->rc_always_pace = 0;
	if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
		rack->r_mbuf_queue = 1;
	else
		rack->r_mbuf_queue = 0;
	rack_set_pace_segments(tp, rack, __LINE__, NULL);
	if (rack_limits_scwnd)
		rack->r_limit_scw = 1;
	else
		rack->r_limit_scw = 0;
	rack_init_retransmit_value(rack, rack_rxt_controls);
	rack->rc_labc = V_tcp_abc_l_var;
	if (rack_honors_hpts_min_to)
		rack->r_use_hpts_min = 1;
	if (tp->snd_una != 0) {
		rack->rc_sendvars_notset = 0;
		/*
		 * Make sure any TCP timers are not running.
		 */
		tcp_timer_stop(tp);
	} else {
		/*
		 * Server side, we are called from the
		 * syn-cache. This means none of the
		 * snd_una/max are set yet so we have
		 * to defer this until the first send.
		 */
		rack->rc_sendvars_notset = 1;
	}

	rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
	rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
	rack->r_ctl.rc_min_to = rack_min_to;
	microuptime(&rack->r_ctl.act_rcv_time);
	rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
	if (rack_hw_up_only)
		rack->r_up_only = 1;
	if (rack_do_dyn_mul) {
		/* When dynamic adjustment is on CA needs to start at 100% */
		rack->rc_gp_dyn_mul = 1;
		if (rack_do_dyn_mul >= 100)
			rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
	} else
		rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
	rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
	if (rack_timely_off) {
		rack->rc_skip_timely = 1;
	}
	if (rack->rc_skip_timely) {
		/* Fixed gp multipliers when timely is disabled. */
		rack->r_ctl.rack_per_of_gp_rec = 90;
		rack->r_ctl.rack_per_of_gp_ca = 100;
		rack->r_ctl.rack_per_of_gp_ss = 250;
	}
	rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
	rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
	rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);

	setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
	    rack_probertt_filter_life);
	us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
	rack->r_ctl.rc_time_of_last_probertt = us_cts;
	rack->r_ctl.rc_went_idle_time = us_cts;
	rack->r_ctl.rc_time_probertt_starts = 0;

	/* rack_rnd_cnt_req packs several fields: low byte is the round threshold. */
	rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff;
	if (rack_rnd_cnt_req & 0x10000)
		rack->r_ctl.gate_to_fs = 1;
	rack->r_ctl.gp_gain_req = rack_gp_gain_req;
	if ((rack_rnd_cnt_req & 0x100) > 0) {
		/*
		 * NOTE(review): empty branch -- the 0x100 bit of
		 * rack_rnd_cnt_req is tested but has no effect here.
		 * Presumably a leftover or placeholder; confirm intent.
		 */
	}
	if (rack_dsack_std_based & 0x1) {
		/* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
		rack->rc_rack_tmr_std_based = 1;
	}
	if (rack_dsack_std_based & 0x2) {
		/* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
		rack->rc_rack_use_dsack = 1;
	}
	/* We require at least one measurement, even if the sysctl is 0 */
	if (rack_req_measurements)
		rack->r_ctl.req_measurements = rack_req_measurements;
	else
		rack->r_ctl.req_measurements = 1;
	if (rack_enable_hw_pacing)
		rack->rack_hdw_pace_ena = 1;
	if (rack_hw_rate_caps)
		rack->r_rack_hw_rate_caps = 1;
	if (rack_non_rxt_use_cr)
		rack->rack_rec_nonrxt_use_cr = 1;
	/* Lets setup the fsb block */
	err = rack_init_fsb(tp, rack);
	if (err) {
		uma_zfree(rack_pcb_zone, *ptr);
		*ptr = NULL;
		return (err);
	}
	if (rack_do_hystart) {
		tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
		if (rack_do_hystart > 1)
			tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
		if (rack_do_hystart > 2)
			tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
	}
	/* Log what we will do with queries */
	rack_log_chg_info(tp, rack, 7,
	    no_query, 0, 0);
	if (rack_def_profile)
		rack_set_profile(rack, rack_def_profile);
	/* Cancel the GP measurement in progress */
	tp->t_flags &= ~TF_GPUTINPROG;
	if ((tp->t_state != TCPS_CLOSED) &&
	    (tp->t_state != TCPS_TIME_WAIT)) {
		/*
		 * We are already open, we may
		 * need to adjust a few things.
		 */
		if (SEQ_GT(tp->snd_max, tp->iss))
			snt = tp->snd_max - tp->iss;
		else
			snt = 0;
		iwin = rc_init_window(rack);
		if ((snt < iwin) &&
		    (no_query == 1)) {
			/* We are not past the initial window
			 * on the first init (i.e. a stack switch
			 * has not yet occured) so we need to make
			 * sure cwnd and ssthresh is correct.
			 */
			if (tp->snd_cwnd < iwin)
				tp->snd_cwnd = iwin;
			/*
			 * If we are within the initial window
			 * we want ssthresh to be unlimited. Setting
			 * it to the rwnd (which the default stack does
			 * and older racks) is not really a good idea
			 * since we want to be in SS and grow both the
			 * cwnd and the rwnd (via dynamic rwnd growth). If
			 * we set it to the rwnd then as the peer grows its
			 * rwnd we will be stuck in CA and never hit SS.
			 *
			 * Its far better to raise it up high (this takes the
			 * risk that there as been a loss already, probably
			 * we should have an indicator in all stacks of loss
			 * but we don't), but considering the normal use this
			 * is a risk worth taking. The consequences of not
			 * hitting SS are far worse than going one more time
			 * into it early on (before we have sent even a IW).
			 * It is highly unlikely that we will have had a loss
			 * before getting the IW out.
			 */
			tp->snd_ssthresh = 0xffffffff;
		}
		/*
		 * Any init based on sequence numbers
		 * should be done in the deferred init path
		 * since we can be CLOSED and not have them
		 * inited when rack_init() is called. We
		 * are not closed so lets call it.
		 */
		rack_deferred_init(tp, rack);
	}
	if ((tp->t_state != TCPS_CLOSED) &&
	    (tp->t_state != TCPS_TIME_WAIT) &&
	    (no_query == 0) &&
	    (tp->snd_una != tp->snd_max)) {
		/* Stack switch with data in flight: rebuild the sendmap. */
		err = rack_init_outstanding(tp, rack, us_cts, *ptr);
		if (err) {
			/* rack_init_outstanding() already freed *ptr on failure. */
			*ptr = NULL;
			return(err);
		}
	}
	rack_stop_all_timers(tp, rack);
	/* Setup all the t_flags2 */
	if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
		tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
	else
		tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
	if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
		tp->t_flags2 |= TF2_MBUF_ACKCMP;
	/*
	 * Timers in Rack are kept in microseconds so lets
	 * convert any initial incoming variables
	 * from ticks into usecs. Note that we
	 * also change the values of t_srtt and t_rttvar, if
	 * they are non-zero. They are kept with a 5
	 * bit decimal so we have to carefully convert
	 * these to get the full precision.
	 */
	rack_convert_rtts(tp);
	rack_log_hystart_event(rack, rack->r_ctl.roundends, 20);
	if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) {
		/* We do not start any timers on DROPPED connections */
		if (tp->t_fb->tfb_chg_query == NULL) {
			rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
		} else {
			struct tcp_query_resp qr;
			int ret;

			memset(&qr, 0, sizeof(qr));

			/* Get the misc time stamps and such for rack */
			qr.req = TCP_QUERY_RACK_TIMES;
			ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
			if (ret == 1) {
				/* Inherit the old stack's rack/PRR/persist state. */
				rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts;
				rack->r_ctl.num_dsack = qr.rack_num_dsacks;
				rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time;
				rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt;
				rack->rc_rack_rtt = qr.rack_rtt;
				rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time;
				rack->r_ctl.rc_sacked = qr.rack_sacked;
				rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt;
				rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered;
				rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs;
				rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt;
				rack->r_ctl.rc_prr_out = qr.rack_prr_out;
				if (qr.rack_tlp_out) {
					rack->rc_tlp_in_progress = 1;
					rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out;
				} else {
					rack->rc_tlp_in_progress = 0;
					rack->r_ctl.rc_tlp_cnt_out = 0;
				}
				if (qr.rack_srtt_measured)
					rack->rc_srtt_measure_made = 1;
				if (qr.rack_in_persist == 1) {
					/* Re-enter persist mode carried over from the old stack. */
					rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle;
#ifdef NETFLIX_SHARED_CWND
					if (rack->r_ctl.rc_scw) {
						tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
						rack->rack_scwnd_is_idle = 1;
					}
#endif
					rack->r_ctl.persist_lost_ends = 0;
					rack->probe_not_answered = 0;
					rack->forced_ack = 0;
					tp->t_rxtshift = 0;
					rack->rc_in_persist = 1;
					RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
					    rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
				}
				if (qr.rack_wanted_output)
					rack->r_wanted_output = 1;
				rack_log_chg_info(tp, rack, 6,
				    qr.rack_min_rtt,
				    qr.rack_rtt,
				    qr.rack_reorder_ts);
			}
			/* Get the old stack timers */
			qr.req_param = 0;
			qr.req = TCP_QUERY_TIMERS_UP;
			ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
			if (ret) {
				/*
				 * non-zero return means we have a timer('s)
				 * to start. Zero means no timer (no keepalive
				 * I suppose).
				 */
				uint32_t tov = 0;

				rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags;
				if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) {
					rack->r_ctl.rc_last_output_to = qr.timer_pacing_to;
					if (TSTMP_GT(qr.timer_pacing_to, us_cts))
						tov = qr.timer_pacing_to - us_cts;
					else
						tov = HPTS_USECS_PER_SLOT;
				}
				if (qr.timer_hpts_flags & PACE_TMR_MASK) {
					rack->r_ctl.rc_timer_exp = qr.timer_timer_exp;
					/* The pacer deadline, if any, takes precedence. */
					if (tov == 0) {
						if (TSTMP_GT(qr.timer_timer_exp, us_cts))
							tov = qr.timer_timer_exp - us_cts;
						else
							tov = HPTS_USECS_PER_SLOT;
					}
				}
				rack_log_chg_info(tp, rack, 4,
				    rack->r_ctl.rc_hpts_flags,
				    rack->r_ctl.rc_last_output_to,
				    rack->r_ctl.rc_timer_exp);
				if (tov) {
					struct hpts_diag diag;

					tcp_hpts_insert(tp, tov, &diag);
					rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
				}
			}
		}
		rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur,
		    __LINE__, RACK_RTTS_INIT);
	}
	return (0);
}
14857
14858 static int
rack_handoff_ok(struct tcpcb * tp)14859 rack_handoff_ok(struct tcpcb *tp)
14860 {
14861 if ((tp->t_state == TCPS_CLOSED) ||
14862 (tp->t_state == TCPS_LISTEN)) {
14863 /* Sure no problem though it may not stick */
14864 return (0);
14865 }
14866 if ((tp->t_state == TCPS_SYN_SENT) ||
14867 (tp->t_state == TCPS_SYN_RECEIVED)) {
14868 /*
14869 * We really don't know if you support sack,
14870 * you have to get to ESTAB or beyond to tell.
14871 */
14872 return (EAGAIN);
14873 }
14874 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
14875 /*
14876 * Rack will only send a FIN after all data is acknowledged.
14877 * So in this case we have more data outstanding. We can't
14878 * switch stacks until either all data and only the FIN
14879 * is left (in which case rack_init() now knows how
14880 * to deal with that) <or> all is acknowledged and we
14881 * are only left with incoming data, though why you
14882 * would want to switch to rack after all data is acknowledged
14883 * I have no idea (rrs)!
14884 */
14885 return (EAGAIN);
14886 }
14887 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
14888 return (0);
14889 }
14890 /*
14891 * If we reach here we don't do SACK on this connection so we can
14892 * never do rack.
14893 */
14894 return (EINVAL);
14895 }
14896
/*
 * Tear down the rack stack state attached to this tcpcb.  Releases the
 * shared cwnd slot, fast-send-block header, hardware pacing rate,
 * deferred options, and every sendmap entry (both in-use and on the
 * free list) before freeing the per-connection rack pcb itself.
 * Safe to call when tp->t_fb_ptr is already NULL (nothing to do then
 * except normalizing snd_nxt).
 */
static void
rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{

	if (tp->t_fb_ptr) {
		uint32_t cnt_free = 0;
		struct tcp_rack *rack;
		struct rack_sendmap *rsm;

		tcp_handle_orphaned_packets(tp);
		tp->t_flags &= ~TF_FORCEDATA;
		rack = (struct tcp_rack *)tp->t_fb_ptr;
		/* Final pacing-calc log record (mod 20) for this flow. */
		rack_log_pacing_delay_calc(rack,
					   0,
					   0,
					   0,
					   rack_get_gp_est(rack), /* delRate */
					   rack_get_lt_bw(rack), /* rttProp */
					   20, __LINE__, NULL, 0);
#ifdef NETFLIX_SHARED_CWND
		if (rack->r_ctl.rc_scw) {
			uint32_t limit;

			/* Optionally cap the scw with our lowest observed us RTT. */
			if (rack->r_limit_scw)
				limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
			else
				limit = 0;
			tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
						  rack->r_ctl.rc_scw_index,
						  limit);
			rack->r_ctl.rc_scw = NULL;
		}
#endif
		/* Free the prebuilt fast-send-block header, if any. */
		if (rack->r_ctl.fsb.tcp_ip_hdr) {
			free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
			rack->r_ctl.fsb.tcp_ip_hdr = NULL;
			rack->r_ctl.fsb.th = NULL;
		}
		if (rack->rc_always_pace == 1) {
			rack_remove_pacing(rack);
		}
		/* Clean up any options if they were not applied */
		while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
			struct deferred_opt_list *dol;

			dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
			TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
			free(dol, M_TCPDO);
		}
		/* rack does not use force data but other stacks may clear it */
		/* Release any hardware pacing rate we still hold. */
		if (rack->r_ctl.crte != NULL) {
			tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
			rack->rack_hdrw_pacing = 0;
			rack->r_ctl.crte = NULL;
		}
#ifdef TCP_BLACKBOX
		tcp_log_flowend(tp);
#endif
		/*
		 * Lets take a different approach to purging just
		 * get each one and free it like a cum-ack would and
		 * not use a foreach loop.
		 */
		rsm = tqhash_min(rack->r_ctl.tqh);
		while (rsm) {
			tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
			rack->r_ctl.rc_num_maps_alloced--;
			uma_zfree(rack_zone, rsm);
			rsm = tqhash_min(rack->r_ctl.tqh);
		}
		/* Now drain the free-list cache of sendmap entries. */
		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
		while (rsm) {
			TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
			rack->r_ctl.rc_num_maps_alloced--;
			rack->rc_free_cnt--;
			cnt_free++;
			uma_zfree(rack_zone, rsm);
			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
		}
		/* Free the PCM measurement scratch array, if allocated. */
		if (rack->r_ctl.pcm_s != NULL) {
			free(rack->r_ctl.pcm_s, M_TCPPCM);
			rack->r_ctl.pcm_s = NULL;
			rack->r_ctl.pcm_i.cnt_alloc = 0;
			rack->r_ctl.pcm_i.cnt = 0;
		}
		/*
		 * If any sendmap entries are unaccounted for after the
		 * drains above, emit a BB log record (flex8 = 10) so the
		 * leak can be diagnosed; the KASSERT below will fire on
		 * INVARIANTS kernels.
		 */
		if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
		    (tcp_bblogging_on(tp))) {
			union tcp_log_stackspecific log;
			struct timeval tv;

			memset(&log, 0, sizeof(log));
			log.u_bbr.flex8 = 10;
			log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced;
			log.u_bbr.flex2 = rack->rc_free_cnt;
			log.u_bbr.flex3 = cnt_free;
			log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
			rsm = tqhash_min(rack->r_ctl.tqh);
			log.u_bbr.delRate = (uintptr_t)rsm;
			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
			log.u_bbr.cur_del_rate = (uintptr_t)rsm;
			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
			log.u_bbr.pkt_epoch = __LINE__;
			(void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
					     0, &log, false, NULL, NULL, 0, &tv);
		}
		KASSERT((rack->r_ctl.rc_num_maps_alloced == 0),
			("rack:%p num_aloc:%u after freeing all?",
			 rack,
			 rack->r_ctl.rc_num_maps_alloced));
		rack->rc_free_cnt = 0;
		free(rack->r_ctl.tqh, M_TCPFSB);
		rack->r_ctl.tqh = NULL;
		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
		tp->t_fb_ptr = NULL;
	}
	/* Make sure snd_nxt is correctly set */
	tp->snd_nxt = tp->snd_max;
}
15015
15016 static void
rack_set_state(struct tcpcb * tp,struct tcp_rack * rack)15017 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
15018 {
15019 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
15020 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0;
15021 }
15022 switch (tp->t_state) {
15023 case TCPS_SYN_SENT:
15024 rack->r_state = TCPS_SYN_SENT;
15025 rack->r_substate = rack_do_syn_sent;
15026 break;
15027 case TCPS_SYN_RECEIVED:
15028 rack->r_state = TCPS_SYN_RECEIVED;
15029 rack->r_substate = rack_do_syn_recv;
15030 break;
15031 case TCPS_ESTABLISHED:
15032 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15033 rack->r_state = TCPS_ESTABLISHED;
15034 rack->r_substate = rack_do_established;
15035 break;
15036 case TCPS_CLOSE_WAIT:
15037 rack->r_state = TCPS_CLOSE_WAIT;
15038 rack->r_substate = rack_do_close_wait;
15039 break;
15040 case TCPS_FIN_WAIT_1:
15041 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15042 rack->r_state = TCPS_FIN_WAIT_1;
15043 rack->r_substate = rack_do_fin_wait_1;
15044 break;
15045 case TCPS_CLOSING:
15046 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15047 rack->r_state = TCPS_CLOSING;
15048 rack->r_substate = rack_do_closing;
15049 break;
15050 case TCPS_LAST_ACK:
15051 rack_set_pace_segments(tp, rack, __LINE__, NULL);
15052 rack->r_state = TCPS_LAST_ACK;
15053 rack->r_substate = rack_do_lastack;
15054 break;
15055 case TCPS_FIN_WAIT_2:
15056 rack->r_state = TCPS_FIN_WAIT_2;
15057 rack->r_substate = rack_do_fin_wait_2;
15058 break;
15059 case TCPS_LISTEN:
15060 case TCPS_CLOSED:
15061 case TCPS_TIME_WAIT:
15062 default:
15063 break;
15064 };
15065 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
15066 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
15067
15068 }
15069
/*
 * Audit that the timer currently armed (if any) is the right one for
 * the connection's present state; if not, cancel it and restart the
 * hpts timer from scratch.  Called after ack processing when no send
 * occurred (or we were bounced out because hpts was running).
 */
static void
rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
{
	/*
	 * We received an ack, and then did not
	 * call send or were bounced out due to the
	 * hpts was running. Now a timer is up as well, is
	 * it the right timer?
	 */
	struct rack_sendmap *rsm;
	int tmr_up;

	tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
	if (tcp_in_hpts(rack->rc_tp) == 0) {
		/*
		 * Ok we probably need some timer up, but no
		 * matter what the mask we are not in hpts. We
		 * may have received an old ack and thus did nothing.
		 */
		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
		rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
		return;
	}
	/* In persist state the persist timer is the correct one. */
	if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
		return;
	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
	    (tmr_up == PACE_TMR_RXT)) {
		/* Should be an RXT */
		return;
	}
	if (rsm == NULL) {
		/* Nothing outstanding? */
		if (tp->t_flags & TF_DELACK) {
			if (tmr_up == PACE_TMR_DELACK)
				/* We are supposed to have delayed ack up and we do */
				return;
		} else if (((V_tcp_always_keepalive ||
			     rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
			    (tp->t_state <= TCPS_CLOSING)) &&
			   (tmr_up == PACE_TMR_KEEP) &&
			   (tp->snd_max == tp->snd_una)) {
			/* We should have keep alive up and we do */
			return;
		}
	}
	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
	    ((tmr_up == PACE_TMR_TLP) ||
	     (tmr_up == PACE_TMR_RACK) ||
	     (tmr_up == PACE_TMR_RXT))) {
		/*
		 * Either a Rack, TLP or RXT is fine if we
		 * have outstanding data.
		 */
		return;
	} else if (tmr_up == PACE_TMR_DELACK) {
		/*
		 * If the delayed ack was going to go off
		 * before the rtx/tlp/rack timer were going to
		 * expire, then that would be the timer in control.
		 * Note we don't check the time here trusting the
		 * code is correct.
		 */
		return;
	}
	/*
	 * Ok the timer originally started is not what we want now.
	 * We will force the hpts to be stopped if any, and restart
	 * with the slot set to what was in the saved slot.
	 */
	if (tcp_in_hpts(rack->rc_tp)) {
		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
			uint32_t us_cts;

			/*
			 * A pacer run was pending; remember how early we are
			 * pulling it so the deficit can be accounted for.
			 */
			us_cts = tcp_get_usecs(NULL);
			if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
				rack->r_early = 1;
				rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
			}
			rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
		}
		tcp_hpts_remove(rack->rc_tp);
	}
	rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
	rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
}
15156
15157
15158 static void
rack_do_win_updates(struct tcpcb * tp,struct tcp_rack * rack,uint32_t tiwin,uint32_t seq,uint32_t ack,uint32_t cts)15159 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts)
15160 {
15161 if ((SEQ_LT(tp->snd_wl1, seq) ||
15162 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
15163 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
15164 /* keep track of pure window updates */
15165 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
15166 KMOD_TCPSTAT_INC(tcps_rcvwinupd);
15167 tp->snd_wnd = tiwin;
15168 rack_validate_fo_sendwin_up(tp, rack);
15169 tp->snd_wl1 = seq;
15170 tp->snd_wl2 = ack;
15171 if (tp->snd_wnd > tp->max_sndwnd)
15172 tp->max_sndwnd = tp->snd_wnd;
15173 rack->r_wanted_output = 1;
15174 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
15175 tp->snd_wnd = tiwin;
15176 rack_validate_fo_sendwin_up(tp, rack);
15177 tp->snd_wl1 = seq;
15178 tp->snd_wl2 = ack;
15179 } else {
15180 /* Not a valid win update */
15181 return;
15182 }
15183 if (tp->snd_wnd > tp->max_sndwnd)
15184 tp->max_sndwnd = tp->snd_wnd;
15185 /* Do we exit persists? */
15186 if ((rack->rc_in_persist != 0) &&
15187 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
15188 rack->r_ctl.rc_pace_min_segs))) {
15189 rack_exit_persist(tp, rack, cts);
15190 }
15191 /* Do we enter persists? */
15192 if ((rack->rc_in_persist == 0) &&
15193 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
15194 TCPS_HAVEESTABLISHED(tp->t_state) &&
15195 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
15196 sbavail(&tptosocket(tp)->so_snd) &&
15197 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
15198 /*
15199 * Here the rwnd is less than
15200 * the pacing size, we are established,
15201 * nothing is outstanding, and there is
15202 * data to send. Enter persists.
15203 */
15204 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack);
15205 }
15206 }
15207
/*
 * Emit a TCP_LOG_IN black-box record for one compressed-ack entry,
 * synthesizing a fake TCP header (seq/ack/win/ports/flags and, when
 * present, a timestamp option) so the record looks like a normal
 * inbound segment to log consumers.  No-op unless BB logging is on.
 */
static void
rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
{

	if (tcp_bblogging_on(rack->rc_tp)) {
		struct inpcb *inp = tptoinpcb(tp);
		union tcp_log_stackspecific log;
		struct timeval ltv;
		char tcp_hdr_buf[60];
		struct tcphdr *th;
		struct timespec ts;
		uint32_t orig_snd_una;
		uint8_t xx = 0;

#ifdef TCP_REQUEST_TRK
		struct tcp_sendfile_track *tcp_req;

		/* Look up the request-track entry covering this ack point. */
		if (SEQ_GT(ae->ack, tp->snd_una)) {
			tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1));
		} else {
			tcp_req = tcp_req_find_req_for_seq(tp, ae->ack);
		}
#endif
		memset(&log, 0, sizeof(log));
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		if (rack->rack_no_prr == 0)
			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
		else
			log.u_bbr.flex1 = 0;
		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->r_might_revert;
		log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
		log.u_bbr.bbr_state = rack->rc_free_cnt;
		log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
		log.u_bbr.pkts_out = tp->t_maxseg;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex7 = 1;
		log.u_bbr.lost = ae->flags;
		log.u_bbr.cwnd_gain = ackval;
		log.u_bbr.pacing_gain = 0x2;
		if (ae->flags & TSTMP_HDWR) {
			/* Record the hardware timestamp if present */
			log.u_bbr.flex3 = M_TSTMP;
			ts.tv_sec = ae->timestamp / 1000000000;
			ts.tv_nsec = ae->timestamp % 1000000000;
			ltv.tv_sec = ts.tv_sec;
			ltv.tv_usec = ts.tv_nsec / 1000;
			log.u_bbr.lt_epoch = tcp_tv_to_usec(&ltv);
		} else if (ae->flags & TSTMP_LRO) {
			/* Record the LRO the arrival timestamp */
			log.u_bbr.flex3 = M_TSTMP_LRO;
			ts.tv_sec = ae->timestamp / 1000000000;
			ts.tv_nsec = ae->timestamp % 1000000000;
			ltv.tv_sec = ts.tv_sec;
			ltv.tv_usec = ts.tv_nsec / 1000;
			log.u_bbr.flex5 = tcp_tv_to_usec(&ltv);
		}
		log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
		/* Log the rcv time */
		log.u_bbr.delRate = ae->timestamp;
#ifdef TCP_REQUEST_TRK
		log.u_bbr.applimited = tp->t_tcpreq_closed;
		log.u_bbr.applimited <<= 8;
		log.u_bbr.applimited |= tp->t_tcpreq_open;
		log.u_bbr.applimited <<= 8;
		log.u_bbr.applimited |= tp->t_tcpreq_req;
		if (tcp_req) {
			/* Copy out any client req info */
			/* seconds */
			log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC);
			/* useconds */
			log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC);
			log.u_bbr.rttProp = tcp_req->timestamp;
			log.u_bbr.cur_del_rate = tcp_req->start;
			if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) {
				log.u_bbr.flex8 |= 1;
			} else {
				log.u_bbr.flex8 |= 2;
				log.u_bbr.bw_inuse = tcp_req->end;
			}
			log.u_bbr.flex6 = tcp_req->start_seq;
			if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) {
				log.u_bbr.flex8 |= 4;
				log.u_bbr.epoch = tcp_req->end_seq;
			}
		}
#endif
		/* Build a fake TCP header from the ack entry's fields. */
		memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
		th = (struct tcphdr *)tcp_hdr_buf;
		th->th_seq = ae->seq;
		th->th_ack = ae->ack;
		th->th_win = ae->win;
		/* Now fill in the ports */
		th->th_sport = inp->inp_fport;
		th->th_dport = inp->inp_lport;
		tcp_set_flags(th, ae->flags);
		/* Now do we have a timestamp option? */
		if (ae->flags & HAS_TSTMP) {
			u_char *cp;
			uint32_t val;

			/* Append a standard NOP/NOP/TS timestamp option. */
			th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
			cp = (u_char *)(th + 1);
			*cp = TCPOPT_NOP;
			cp++;
			*cp = TCPOPT_NOP;
			cp++;
			*cp = TCPOPT_TIMESTAMP;
			cp++;
			*cp = TCPOLEN_TIMESTAMP;
			cp++;
			val = htonl(ae->ts_value);
			bcopy((char *)&val,
			      (char *)cp, sizeof(uint32_t));
			val = htonl(ae->ts_echo);
			bcopy((char *)&val,
			      (char *)(cp + 4), sizeof(uint32_t));
		} else
			th->th_off = (sizeof(struct tcphdr) >> 2);

		/*
		 * For sane logging we need to play a little trick.
		 * If the ack were fully processed we would have moved
		 * snd_una to high_seq, but since compressed acks are
		 * processed in two phases, at this point (logging) snd_una
		 * won't be advanced. So we would see multiple acks showing
		 * the advancement. We can prevent that by "pretending" that
		 * snd_una was advanced and then un-advancing it so that the
		 * logging code has the right value for tlb_snd_una.
		 */
		if (tp->snd_una != high_seq) {
			orig_snd_una = tp->snd_una;
			tp->snd_una = high_seq;
			xx = 1;
		} else
			xx = 0;
		TCP_LOG_EVENTP(tp, th,
			       &tptosocket(tp)->so_rcv,
			       &tptosocket(tp)->so_snd, TCP_LOG_IN, 0,
			       0, &log, true, &ltv);
		if (xx) {
			tp->snd_una = orig_snd_una;
		}
	}

}
15355
15356 static void
rack_handle_probe_response(struct tcp_rack * rack,uint32_t tiwin,uint32_t us_cts)15357 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
15358 {
15359 uint32_t us_rtt;
15360 /*
15361 * A persist or keep-alive was forced out, update our
15362 * min rtt time. Note now worry about lost responses.
15363 * When a subsequent keep-alive or persist times out
15364 * and forced_ack is still on, then the last probe
15365 * was not responded to. In such cases we have a
15366 * sysctl that controls the behavior. Either we apply
15367 * the rtt but with reduced confidence (0). Or we just
15368 * plain don't apply the rtt estimate. Having data flow
15369 * will clear the probe_not_answered flag i.e. cum-ack
15370 * move forward <or> exiting and reentering persists.
15371 */
15372
15373 rack->forced_ack = 0;
15374 rack->rc_tp->t_rxtshift = 0;
15375 if ((rack->rc_in_persist &&
15376 (tiwin == rack->rc_tp->snd_wnd)) ||
15377 (rack->rc_in_persist == 0)) {
15378 /*
15379 * In persists only apply the RTT update if this is
15380 * a response to our window probe. And that
15381 * means the rwnd sent must match the current
15382 * snd_wnd. If it does not, then we got a
15383 * window update ack instead. For keepalive
15384 * we allow the answer no matter what the window.
15385 *
15386 * Note that if the probe_not_answered is set then
15387 * the forced_ack_ts is the oldest one i.e. the first
15388 * probe sent that might have been lost. This assures
15389 * us that if we do calculate an RTT it is longer not
15390 * some short thing.
15391 */
15392 if (rack->rc_in_persist)
15393 counter_u64_add(rack_persists_acks, 1);
15394 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
15395 if (us_rtt == 0)
15396 us_rtt = 1;
15397 if (rack->probe_not_answered == 0) {
15398 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15399 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
15400 } else {
15401 /* We have a retransmitted probe here too */
15402 if (rack_apply_rtt_with_reduced_conf) {
15403 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15404 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
15405 }
15406 }
15407 }
15408 }
15409
/*
 * Mark the start of a new measurement round: the round will end when
 * everything up to the current snd_max is acknowledged.  Clears the
 * "new round needed" latch and logs a hystart event (mod 4).
 * NOTE(review): high_seq is currently unused here — presumably kept
 * for signature symmetry with rack_new_round_setup(); confirm.
 */
static void
rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
{
	/*
	 * The next send has occurred mark the end of the round
	 * as when that data gets acknowledged. We can
	 * also do common things we might need to do when
	 * a round begins.
	 */
	rack->r_ctl.roundends = tp->snd_max;
	rack->rc_new_rnd_needed = 0;
	rack_log_hystart_event(rack, tp->snd_max, 4);
}
15423
15424
15425 static void
rack_log_pcm(struct tcp_rack * rack,uint8_t mod,uint32_t flex1,uint32_t flex2,uint32_t flex3)15426 rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
15427 uint32_t flex3)
15428 {
15429 if (tcp_bblogging_on(rack->rc_tp)) {
15430 union tcp_log_stackspecific log;
15431 struct timeval tv;
15432
15433 (void)tcp_get_usecs(&tv);
15434 memset(&log, 0, sizeof(log));
15435 log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
15436 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15437 log.u_bbr.flex8 = mod;
15438 log.u_bbr.flex1 = flex1;
15439 log.u_bbr.flex2 = flex2;
15440 log.u_bbr.flex3 = flex3;
15441 log.u_bbr.flex4 = rack_pcm_every_n_rounds;
15442 log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds;
15443 log.u_bbr.bbr_substate = rack->pcm_needed;
15444 log.u_bbr.bbr_substate <<= 1;
15445 log.u_bbr.bbr_substate |= rack->pcm_in_progress;
15446 log.u_bbr.bbr_substate <<= 1;
15447 log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */
15448 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
15449 0, &log, false, NULL, NULL, 0, &tv);
15450 }
15451 }
15452
/*
 * The current round has ended (high_seq reached the round's end
 * point).  Bump the round counter, arm the next round to begin on the
 * next send, schedule a PCM measurement when one is due, feed the new
 * round number to a hystart-aware CC algorithm, and — for DGP during
 * the initial slow-start — check whether slow-start has stopped
 * yielding goodput gains and should be exited.
 */
static void
rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
{
	/*
	 * The round (current_round) has ended. We now
	 * setup for the next round by incrementing the
	 * round numnber and doing any round specific
	 * things.
	 */
	rack_log_hystart_event(rack, high_seq, 21);
	rack->r_ctl.current_round++;
	/* New round (current_round) begins at next send */
	rack->rc_new_rnd_needed = 1;
	if ((rack->pcm_enabled == 1) &&
	    (rack->pcm_needed == 0) &&
	    (rack->pcm_in_progress == 0)) {
		/*
		 * If we have enabled PCM, then we need to
		 * check if the round has adanced to the state
		 * where one is required.
		 */
		int rnds;

		rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
		if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
			rack->pcm_needed = 1;
			rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
		} else if (rack_verbose_logging) {
			/* Not due yet; log the check only in verbose mode. */
			rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
		}
	}
	if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
		/* We have hystart enabled send the round info in */
		if (CC_ALGO(tp)->newround != NULL) {
			CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
		}
	}
	/*
	 * For DGP an initial startup check. We want to validate
	 * that we are not just pushing on slow-start and just
	 * not gaining.. i.e. filling buffers without getting any
	 * boost in b/w during the inital slow-start.
	 */
	if (rack->dgp_on &&
	    (rack->rc_initial_ss_comp == 0) &&
	    (tp->snd_cwnd < tp->snd_ssthresh) &&
	    (rack->r_ctl.num_measurements >= RACK_REQ_AVG) &&
	    (rack->r_ctl.gp_rnd_thresh > 0) &&
	    ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) {

		/*
		 * We are in the initial SS and we have hd rack_rnd_cnt_req rounds(def:5) where
		 * we have not gained the required amount in the gp_est (120.0% aka 1200). Lets
		 * exit SS.
		 *
		 * Pick up the flight size now as we enter slowstart (not the
		 * cwnd which may be inflated).
		 */
		rack->rc_initial_ss_comp = 1;

		if (tcp_bblogging_on(rack->rc_tp)) {
			/* BB log (flex8 = 40) recording the forced SS exit. */
			union tcp_log_stackspecific log;
			struct timeval tv;

			memset(&log, 0, sizeof(log));
			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
			log.u_bbr.flex1 = rack->r_ctl.current_round;
			log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
			log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh;
			log.u_bbr.flex4 = rack->r_ctl.gate_to_fs;
			log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs;
			log.u_bbr.flex8 = 40;
			(void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
					    0, &log, false, NULL, __func__, __LINE__,&tv);
		}
		if ((rack->r_ctl.gate_to_fs == 1) &&
		    (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) {
			/* Clamp cwnd back to the flight-size gate. */
			tp->snd_cwnd = rack->r_ctl.ss_hi_fs;
		}
		/* Force cwnd >= ssthresh so we are out of slow-start. */
		tp->snd_ssthresh = tp->snd_cwnd - 1;
		/* Turn off any fast output running */
		rack->r_fast_output = 0;
	}
}
15537
15538 static int
rack_do_compressed_ack_processing(struct tcpcb * tp,struct socket * so,struct mbuf * m,int nxt_pkt,struct timeval * tv)15539 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
15540 {
15541 /*
15542 * Handle a "special" compressed ack mbuf. Each incoming
15543 * ack has only four possible dispositions:
15544 *
15545 * A) It moves the cum-ack forward
15546 * B) It is behind the cum-ack.
15547 * C) It is a window-update ack.
15548 * D) It is a dup-ack.
15549 *
15550 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
15551 * in the incoming mbuf. We also need to still pay attention
15552 * to nxt_pkt since there may be another packet after this
15553 * one.
15554 */
15555 #ifdef TCP_ACCOUNTING
15556 uint64_t ts_val;
15557 uint64_t rdstc;
15558 #endif
15559 int segsiz;
15560 struct timespec ts;
15561 struct tcp_rack *rack;
15562 struct tcp_ackent *ae;
15563 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
15564 int cnt, i, did_out, ourfinisacked = 0;
15565 struct tcpopt to_holder, *to = NULL;
15566 #ifdef TCP_ACCOUNTING
15567 int win_up_req = 0;
15568 #endif
15569 int nsegs = 0;
15570 int under_pacing = 0;
15571 int post_recovery = 0;
15572 #ifdef TCP_ACCOUNTING
15573 sched_pin();
15574 #endif
15575 rack = (struct tcp_rack *)tp->t_fb_ptr;
15576 if (rack->gp_ready &&
15577 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
15578 under_pacing = 1;
15579
15580 if (rack->r_state != tp->t_state)
15581 rack_set_state(tp, rack);
15582 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
15583 (tp->t_flags & TF_GPUTINPROG)) {
15584 /*
15585 * We have a goodput in progress
15586 * and we have entered a late state.
15587 * Do we have enough data in the sb
15588 * to handle the GPUT request?
15589 */
15590 uint32_t bytes;
15591
15592 bytes = tp->gput_ack - tp->gput_seq;
15593 if (SEQ_GT(tp->gput_seq, tp->snd_una))
15594 bytes += tp->gput_seq - tp->snd_una;
15595 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
15596 /*
15597 * There are not enough bytes in the socket
15598 * buffer that have been sent to cover this
15599 * measurement. Cancel it.
15600 */
15601 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
15602 rack->r_ctl.rc_gp_srtt /*flex1*/,
15603 tp->gput_seq,
15604 0, 0, 18, __LINE__, NULL, 0);
15605 tp->t_flags &= ~TF_GPUTINPROG;
15606 }
15607 }
15608 to = &to_holder;
15609 to->to_flags = 0;
15610 KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
15611 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
15612 cnt = m->m_len / sizeof(struct tcp_ackent);
15613 counter_u64_add(rack_multi_single_eq, cnt);
15614 high_seq = tp->snd_una;
15615 the_win = tp->snd_wnd;
15616 win_seq = tp->snd_wl1;
15617 win_upd_ack = tp->snd_wl2;
15618 cts = tcp_tv_to_usec(tv);
15619 ms_cts = tcp_tv_to_msec(tv);
15620 rack->r_ctl.rc_rcvtime = cts;
15621 segsiz = ctf_fixed_maxseg(tp);
15622 if ((rack->rc_gp_dyn_mul) &&
15623 (rack->use_fixed_rate == 0) &&
15624 (rack->rc_always_pace)) {
15625 /* Check in on probertt */
15626 rack_check_probe_rtt(rack, cts);
15627 }
15628 for (i = 0; i < cnt; i++) {
15629 #ifdef TCP_ACCOUNTING
15630 ts_val = get_cyclecount();
15631 #endif
15632 rack_clear_rate_sample(rack);
15633 ae = ((mtod(m, struct tcp_ackent *)) + i);
15634 if (ae->flags & TH_FIN)
15635 rack_log_pacing_delay_calc(rack,
15636 0,
15637 0,
15638 0,
15639 rack_get_gp_est(rack), /* delRate */
15640 rack_get_lt_bw(rack), /* rttProp */
15641 20, __LINE__, NULL, 0);
15642 /* Setup the window */
15643 tiwin = ae->win << tp->snd_scale;
15644 if (tiwin > rack->r_ctl.rc_high_rwnd)
15645 rack->r_ctl.rc_high_rwnd = tiwin;
15646 /* figure out the type of ack */
15647 if (SEQ_LT(ae->ack, high_seq)) {
15648 /* Case B*/
15649 ae->ack_val_set = ACK_BEHIND;
15650 } else if (SEQ_GT(ae->ack, high_seq)) {
15651 /* Case A */
15652 ae->ack_val_set = ACK_CUMACK;
15653 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
15654 /* Case D */
15655 ae->ack_val_set = ACK_DUPACK;
15656 } else {
15657 /* Case C */
15658 ae->ack_val_set = ACK_RWND;
15659 }
15660 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
15661 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
15662 /* Validate timestamp */
15663 if (ae->flags & HAS_TSTMP) {
15664 /* Setup for a timestamp */
15665 to->to_flags = TOF_TS;
15666 ae->ts_echo -= tp->ts_offset;
15667 to->to_tsecr = ae->ts_echo;
15668 to->to_tsval = ae->ts_value;
15669 /*
15670 * If echoed timestamp is later than the current time, fall back to
15671 * non RFC1323 RTT calculation. Normalize timestamp if syncookies
15672 * were used when this connection was established.
15673 */
15674 if (TSTMP_GT(ae->ts_echo, ms_cts))
15675 to->to_tsecr = 0;
15676 if (tp->ts_recent &&
15677 TSTMP_LT(ae->ts_value, tp->ts_recent)) {
15678 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
15679 #ifdef TCP_ACCOUNTING
15680 rdstc = get_cyclecount();
15681 if (rdstc > ts_val) {
15682 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15683 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
15684 }
15685 }
15686 #endif
15687 continue;
15688 }
15689 }
15690 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
15691 SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
15692 tp->ts_recent_age = tcp_ts_getticks();
15693 tp->ts_recent = ae->ts_value;
15694 }
15695 } else {
15696 /* Setup for a no options */
15697 to->to_flags = 0;
15698 }
15699 /* Update the rcv time and perform idle reduction possibly */
15700 if (tp->t_idle_reduce &&
15701 (tp->snd_max == tp->snd_una) &&
15702 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
15703 counter_u64_add(rack_input_idle_reduces, 1);
15704 rack_cc_after_idle(rack, tp);
15705 }
15706 tp->t_rcvtime = ticks;
15707 /* Now what about ECN of a chain of pure ACKs? */
15708 if (tcp_ecn_input_segment(tp, ae->flags, 0,
15709 tcp_packets_this_ack(tp, ae->ack),
15710 ae->codepoint))
15711 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
15712 if (tp->t_flags & TF_ACKNOW)
15713 rack->r_wanted_output = 1;
15714 #ifdef TCP_ACCOUNTING
15715 /* Count for the specific type of ack in */
15716 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15717 tp->tcp_cnt_counters[ae->ack_val_set]++;
15718 }
15719 #endif
15720 /*
15721 * Note how we could move up these in the determination
15722 * above, but we don't so that way the timestamp checks (and ECN)
15723 * is done first before we do any processing on the ACK.
15724 * The non-compressed path through the code has this
15725 * weakness (noted by @jtl) that it actually does some
15726 * processing before verifying the timestamp information.
15727 * We don't take that path here which is why we set
15728 * the ack_val_set first, do the timestamp and ecn
15729 * processing, and then look at what we have setup.
15730 */
15731 if (ae->ack_val_set == ACK_BEHIND) {
15732 /*
15733 * Case B flag reordering, if window is not closed
15734 * or it could be a keep-alive or persists
15735 */
15736 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
15737 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
15738 if (rack->r_ctl.rc_reorder_ts == 0)
15739 rack->r_ctl.rc_reorder_ts = 1;
15740 }
15741 } else if (ae->ack_val_set == ACK_DUPACK) {
15742 /* Case D */
15743 rack_strike_dupack(rack, ae->ack);
15744 } else if (ae->ack_val_set == ACK_RWND) {
15745 /* Case C */
15746 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
15747 ts.tv_sec = ae->timestamp / 1000000000;
15748 ts.tv_nsec = ae->timestamp % 1000000000;
15749 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
15750 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
15751 } else {
15752 rack->r_ctl.act_rcv_time = *tv;
15753 }
15754 if (rack->forced_ack) {
15755 rack_handle_probe_response(rack, tiwin,
15756 tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
15757 }
15758 #ifdef TCP_ACCOUNTING
15759 win_up_req = 1;
15760 #endif
15761 win_upd_ack = ae->ack;
15762 win_seq = ae->seq;
15763 the_win = tiwin;
15764 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15765 } else {
15766 /* Case A */
15767 if (SEQ_GT(ae->ack, tp->snd_max)) {
15768 /*
15769 * We just send an ack since the incoming
15770 * ack is beyond the largest seq we sent.
15771 */
15772 if ((tp->t_flags & TF_ACKNOW) == 0) {
15773 ctf_ack_war_checks(tp);
15774 if (tp->t_flags && TF_ACKNOW)
15775 rack->r_wanted_output = 1;
15776 }
15777 } else {
15778 nsegs++;
15779 /* If the window changed setup to update */
15780 if (tiwin != tp->snd_wnd) {
15781 win_upd_ack = ae->ack;
15782 win_seq = ae->seq;
15783 the_win = tiwin;
15784 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15785 }
15786 #ifdef TCP_ACCOUNTING
15787 /* Account for the acks */
15788 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15789 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
15790 }
15791 #endif
15792 high_seq = ae->ack;
15793 /* Setup our act_rcv_time */
15794 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
15795 ts.tv_sec = ae->timestamp / 1000000000;
15796 ts.tv_nsec = ae->timestamp % 1000000000;
15797 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
15798 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
15799 } else {
15800 rack->r_ctl.act_rcv_time = *tv;
15801 }
15802 rack_process_to_cumack(tp, rack, ae->ack, cts, to,
15803 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
15804 #ifdef TCP_REQUEST_TRK
15805 rack_req_check_for_comp(rack, high_seq);
15806 #endif
15807 if (rack->rc_dsack_round_seen) {
15808 /* Is the dsack round over? */
15809 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
15810 /* Yes it is */
15811 rack->rc_dsack_round_seen = 0;
15812 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
15813 }
15814 }
15815 }
15816 }
15817 /* And lets be sure to commit the rtt measurements for this ack */
15818 tcp_rack_xmit_timer_commit(rack, tp);
15819 #ifdef TCP_ACCOUNTING
15820 rdstc = get_cyclecount();
15821 if (rdstc > ts_val) {
15822 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15823 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
15824 if (ae->ack_val_set == ACK_CUMACK)
15825 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
15826 }
15827 }
15828 #endif
15829 }
15830 #ifdef TCP_ACCOUNTING
15831 ts_val = get_cyclecount();
15832 #endif
15833 /* Tend to any collapsed window */
15834 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
15835 /* The peer collapsed the window */
15836 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__);
15837 } else if (rack->rc_has_collapsed)
15838 rack_un_collapse_window(rack, __LINE__);
15839 if ((rack->r_collapse_point_valid) &&
15840 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
15841 rack->r_collapse_point_valid = 0;
15842 acked_amount = acked = (high_seq - tp->snd_una);
15843 if (acked) {
15844 /*
15845 * The draft (v3) calls for us to use SEQ_GEQ, but that
15846 * causes issues when we are just going app limited. Lets
15847 * instead use SEQ_GT <or> where its equal but more data
15848 * is outstanding.
15849 *
15850 * Also make sure we are on the last ack of a series. We
15851 * have to have all the ack's processed in queue to know
15852 * if there is something left outstanding.
15853 *
15854 */
15855 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
15856 (rack->rc_new_rnd_needed == 0) &&
15857 (nxt_pkt == 0)) {
15858 /*
15859 * We have crossed into a new round with
15860 * this th_ack value.
15861 */
15862 rack_new_round_setup(tp, rack, high_seq);
15863 }
15864 /*
15865 * Clear the probe not answered flag
15866 * since cum-ack moved forward.
15867 */
15868 rack->probe_not_answered = 0;
15869 if (tp->t_flags & TF_NEEDSYN) {
15870 /*
15871 * T/TCP: Connection was half-synchronized, and our SYN has
15872 * been ACK'd (so connection is now fully synchronized). Go
15873 * to non-starred state, increment snd_una for ACK of SYN,
15874 * and check if we can do window scaling.
15875 */
15876 tp->t_flags &= ~TF_NEEDSYN;
15877 tp->snd_una++;
15878 acked_amount = acked = (high_seq - tp->snd_una);
15879 }
15880 if (acked > sbavail(&so->so_snd))
15881 acked_amount = sbavail(&so->so_snd);
15882 if (IN_FASTRECOVERY(tp->t_flags) &&
15883 (rack->rack_no_prr == 0))
15884 rack_update_prr(tp, rack, acked_amount, high_seq);
15885 if (IN_RECOVERY(tp->t_flags)) {
15886 if (SEQ_LT(high_seq, tp->snd_recover) &&
15887 (SEQ_LT(high_seq, tp->snd_max))) {
15888 tcp_rack_partialack(tp);
15889 } else {
15890 rack_post_recovery(tp, high_seq);
15891 post_recovery = 1;
15892 }
15893 } else if ((rack->rto_from_rec == 1) &&
15894 SEQ_GEQ(high_seq, tp->snd_recover)) {
15895 /*
15896 * We were in recovery, hit a rxt timeout
15897 * and never re-entered recovery. The timeout(s)
15898 * made up all the lost data. In such a case
15899 * we need to clear the rto_from_rec flag.
15900 */
15901 rack->rto_from_rec = 0;
15902 }
15903 /* Handle the rack-log-ack part (sendmap) */
15904 if ((sbused(&so->so_snd) == 0) &&
15905 (acked > acked_amount) &&
15906 (tp->t_state >= TCPS_FIN_WAIT_1) &&
15907 (tp->t_flags & TF_SENTFIN)) {
15908 /*
15909 * We must be sure our fin
15910 * was sent and acked (we can be
15911 * in FIN_WAIT_1 without having
15912 * sent the fin).
15913 */
15914 ourfinisacked = 1;
15915 /*
15916 * Lets make sure snd_una is updated
15917 * since most likely acked_amount = 0 (it
15918 * should be).
15919 */
15920 tp->snd_una = high_seq;
15921 }
15922 /* Did we make a RTO error? */
15923 if ((tp->t_flags & TF_PREVVALID) &&
15924 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
15925 tp->t_flags &= ~TF_PREVVALID;
15926 if (tp->t_rxtshift == 1 &&
15927 (int)(ticks - tp->t_badrxtwin) < 0)
15928 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__);
15929 }
15930 /* Handle the data in the socket buffer */
15931 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
15932 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
15933 if (acked_amount > 0) {
15934 uint32_t p_cwnd;
15935 struct mbuf *mfree;
15936
15937 if (post_recovery) {
15938 /*
15939 * Grab the segsiz, multiply by 2 and add the snd_cwnd
15940 * that is the max the CC should add if we are exiting
15941 * recovery and doing a late add.
15942 */
15943 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
15944 p_cwnd <<= 1;
15945 p_cwnd += tp->snd_cwnd;
15946 }
15947 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery);
15948 if (post_recovery && (tp->snd_cwnd > p_cwnd)) {
15949 /* Must be non-newreno (cubic) getting too ahead of itself */
15950 tp->snd_cwnd = p_cwnd;
15951 }
15952 SOCK_SENDBUF_LOCK(so);
15953 mfree = sbcut_locked(&so->so_snd, acked_amount);
15954 tp->snd_una = high_seq;
15955 /* Note we want to hold the sb lock through the sendmap adjust */
15956 rack_adjust_sendmap_head(rack, &so->so_snd);
15957 /* Wake up the socket if we have room to write more */
15958 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
15959 sowwakeup_locked(so);
15960 m_freem(mfree);
15961 }
15962 /* update progress */
15963 tp->t_acktime = ticks;
15964 rack_log_progress_event(rack, tp, tp->t_acktime,
15965 PROGRESS_UPDATE, __LINE__);
15966 /* Clear out shifts and such */
15967 tp->t_rxtshift = 0;
15968 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
15969 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
15970 rack->rc_tlp_in_progress = 0;
15971 rack->r_ctl.rc_tlp_cnt_out = 0;
15972 /* Send recover and snd_nxt must be dragged along */
15973 if (SEQ_GT(tp->snd_una, tp->snd_recover))
15974 tp->snd_recover = tp->snd_una;
15975 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
15976 tp->snd_nxt = tp->snd_max;
15977 /*
15978 * If the RXT timer is running we want to
15979 * stop it, so we can restart a TLP (or new RXT).
15980 */
15981 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
15982 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15983 tp->snd_wl2 = high_seq;
15984 tp->t_dupacks = 0;
15985 if (under_pacing &&
15986 (rack->use_fixed_rate == 0) &&
15987 (rack->in_probe_rtt == 0) &&
15988 rack->rc_gp_dyn_mul &&
15989 rack->rc_always_pace) {
15990 /* Check if we are dragging bottom */
15991 rack_check_bottom_drag(tp, rack, so);
15992 }
15993 if (tp->snd_una == tp->snd_max) {
15994 tp->t_flags &= ~TF_PREVVALID;
15995 rack->r_ctl.retran_during_recovery = 0;
15996 rack->rc_suspicious = 0;
15997 rack->r_ctl.dsack_byte_cnt = 0;
15998 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
15999 if (rack->r_ctl.rc_went_idle_time == 0)
16000 rack->r_ctl.rc_went_idle_time = 1;
16001 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
16002 if (sbavail(&tptosocket(tp)->so_snd) == 0)
16003 tp->t_acktime = 0;
16004 /* Set so we might enter persists... */
16005 rack->r_wanted_output = 1;
16006 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
16007 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
16008 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
16009 (sbavail(&so->so_snd) == 0) &&
16010 (tp->t_flags2 & TF2_DROP_AF_DATA)) {
16011 /*
16012 * The socket was gone and the
16013 * peer sent data (not now in the past), time to
16014 * reset him.
16015 */
16016 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
16017 /* tcp_close will kill the inp pre-log the Reset */
16018 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
16019 #ifdef TCP_ACCOUNTING
16020 rdstc = get_cyclecount();
16021 if (rdstc > ts_val) {
16022 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16023 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16024 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16025 }
16026 }
16027 #endif
16028 m_freem(m);
16029 tp = tcp_close(tp);
16030 if (tp == NULL) {
16031 #ifdef TCP_ACCOUNTING
16032 sched_unpin();
16033 #endif
16034 return (1);
16035 }
16036 /*
16037 * We would normally do drop-with-reset which would
16038 * send back a reset. We can't since we don't have
16039 * all the needed bits. Instead lets arrange for
16040 * a call to tcp_output(). That way since we
16041 * are in the closed state we will generate a reset.
16042 *
16043 * Note if tcp_accounting is on we don't unpin since
16044 * we do that after the goto label.
16045 */
16046 goto send_out_a_rst;
16047 }
16048 if ((sbused(&so->so_snd) == 0) &&
16049 (tp->t_state >= TCPS_FIN_WAIT_1) &&
16050 (tp->t_flags & TF_SENTFIN)) {
16051 /*
16052 * If we can't receive any more data, then closing user can
16053 * proceed. Starting the timer is contrary to the
16054 * specification, but if we don't get a FIN we'll hang
16055 * forever.
16056 *
16057 */
16058 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
16059 soisdisconnected(so);
16060 tcp_timer_activate(tp, TT_2MSL,
16061 (tcp_fast_finwait2_recycle ?
16062 tcp_finwait2_timeout :
16063 TP_MAXIDLE(tp)));
16064 }
16065 if (ourfinisacked == 0) {
16066 /*
16067 * We don't change to fin-wait-2 if we have our fin acked
16068 * which means we are probably in TCPS_CLOSING.
16069 */
16070 tcp_state_change(tp, TCPS_FIN_WAIT_2);
16071 }
16072 }
16073 }
16074 /* Wake up the socket if we have room to write more */
16075 if (sbavail(&so->so_snd)) {
16076 rack->r_wanted_output = 1;
16077 if (ctf_progress_timeout_check(tp, true)) {
16078 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
16079 tp, tick, PROGRESS_DROP, __LINE__);
16080 /*
16081 * We cheat here and don't send a RST, we should send one
16082 * when the pacer drops the connection.
16083 */
16084 #ifdef TCP_ACCOUNTING
16085 rdstc = get_cyclecount();
16086 if (rdstc > ts_val) {
16087 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16088 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16089 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16090 }
16091 }
16092 sched_unpin();
16093 #endif
16094 (void)tcp_drop(tp, ETIMEDOUT);
16095 m_freem(m);
16096 return (1);
16097 }
16098 }
16099 if (ourfinisacked) {
16100 switch(tp->t_state) {
16101 case TCPS_CLOSING:
16102 #ifdef TCP_ACCOUNTING
16103 rdstc = get_cyclecount();
16104 if (rdstc > ts_val) {
16105 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16106 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16107 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16108 }
16109 }
16110 sched_unpin();
16111 #endif
16112 tcp_twstart(tp);
16113 m_freem(m);
16114 return (1);
16115 break;
16116 case TCPS_LAST_ACK:
16117 #ifdef TCP_ACCOUNTING
16118 rdstc = get_cyclecount();
16119 if (rdstc > ts_val) {
16120 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16121 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16122 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16123 }
16124 }
16125 sched_unpin();
16126 #endif
16127 tp = tcp_close(tp);
16128 ctf_do_drop(m, tp);
16129 return (1);
16130 break;
16131 case TCPS_FIN_WAIT_1:
16132 #ifdef TCP_ACCOUNTING
16133 rdstc = get_cyclecount();
16134 if (rdstc > ts_val) {
16135 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16136 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16137 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16138 }
16139 }
16140 #endif
16141 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
16142 soisdisconnected(so);
16143 tcp_timer_activate(tp, TT_2MSL,
16144 (tcp_fast_finwait2_recycle ?
16145 tcp_finwait2_timeout :
16146 TP_MAXIDLE(tp)));
16147 }
16148 tcp_state_change(tp, TCPS_FIN_WAIT_2);
16149 break;
16150 default:
16151 break;
16152 }
16153 }
16154 if (rack->r_fast_output) {
16155 /*
16156 * We re doing fast output.. can we expand that?
16157 */
16158 rack_gain_for_fastoutput(rack, tp, so, acked_amount);
16159 }
16160 #ifdef TCP_ACCOUNTING
16161 rdstc = get_cyclecount();
16162 if (rdstc > ts_val) {
16163 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16164 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16165 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16166 }
16167 }
16168
16169 } else if (win_up_req) {
16170 rdstc = get_cyclecount();
16171 if (rdstc > ts_val) {
16172 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16173 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
16174 }
16175 }
16176 #endif
16177 }
16178 /* Now is there a next packet, if so we are done */
16179 m_freem(m);
16180 did_out = 0;
16181 if (nxt_pkt) {
16182 #ifdef TCP_ACCOUNTING
16183 sched_unpin();
16184 #endif
16185 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
16186 return (0);
16187 }
16188 rack_handle_might_revert(tp, rack);
16189 ctf_calc_rwin(so, tp);
16190 if ((rack->r_wanted_output != 0) ||
16191 (rack->r_fast_output != 0) ||
16192 (tp->t_flags & TF_ACKNOW )) {
16193 send_out_a_rst:
16194 if (tcp_output(tp) < 0) {
16195 #ifdef TCP_ACCOUNTING
16196 sched_unpin();
16197 #endif
16198 return (1);
16199 }
16200 did_out = 1;
16201 }
16202 if (tp->t_flags2 & TF2_HPTS_CALLS)
16203 tp->t_flags2 &= ~TF2_HPTS_CALLS;
16204 rack_free_trim(rack);
16205 #ifdef TCP_ACCOUNTING
16206 sched_unpin();
16207 #endif
16208 rack_timer_audit(tp, rack, &so->so_snd);
16209 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
16210 return (0);
16211 }
16212
/*
 * Network-byte-order image of the canonical TCP timestamp option
 * prefix: NOP, NOP, TIMESTAMP, TCPOLEN_TIMESTAMP packed into one
 * 32-bit word. Comparing this constant against the first 32 bits of
 * a segment's option area recognizes segments whose only option is a
 * well-formed timestamp -- the same test LRO applies when deciding
 * whether packets may be queued/compressed.
 */
#define TCP_LRO_TS_OPTION \
    ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
16216
16217 static int
rack_do_segment_nounlock(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th,int32_t drop_hdrlen,int32_t tlen,uint8_t iptos,int32_t nxt_pkt,struct timeval * tv)16218 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
16219 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt,
16220 struct timeval *tv)
16221 {
16222 struct inpcb *inp = tptoinpcb(tp);
16223 struct socket *so = tptosocket(tp);
16224 #ifdef TCP_ACCOUNTING
16225 uint64_t ts_val;
16226 #endif
16227 int32_t thflags, retval, did_out = 0;
16228 int32_t way_out = 0;
16229 /*
16230 * cts - is the current time from tv (caller gets ts) in microseconds.
16231 * ms_cts - is the current time from tv in milliseconds.
16232 * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
16233 */
16234 uint32_t cts, us_cts, ms_cts;
16235 uint32_t tiwin;
16236 struct timespec ts;
16237 struct tcpopt to;
16238 struct tcp_rack *rack;
16239 struct rack_sendmap *rsm;
16240 int32_t prev_state = 0;
16241 int no_output = 0;
16242 int time_remaining = 0;
16243 #ifdef TCP_ACCOUNTING
16244 int ack_val_set = 0xf;
16245 #endif
16246 int nsegs;
16247
16248 NET_EPOCH_ASSERT();
16249 INP_WLOCK_ASSERT(inp);
16250
16251 /*
16252 * tv passed from common code is from either M_TSTMP_LRO or
16253 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
16254 */
16255 rack = (struct tcp_rack *)tp->t_fb_ptr;
16256 if (rack->rack_deferred_inited == 0) {
16257 /*
16258 * If we are the connecting socket we will
16259 * hit rack_init() when no sequence numbers
16260 * are setup. This makes it so we must defer
16261 * some initialization. Call that now.
16262 */
16263 rack_deferred_init(tp, rack);
16264 }
16265 /*
16266 * Check to see if we need to skip any output plans. This
16267 * can happen in the non-LRO path where we are pacing and
16268 * must process the ack coming in but need to defer sending
16269 * anything becase a pacing timer is running.
16270 */
16271 us_cts = tcp_tv_to_usec(tv);
16272 if (m->m_flags & M_ACKCMP) {
16273 /*
16274 * All compressed ack's are ack's by definition so
16275 * remove any ack required flag and then do the processing.
16276 */
16277 rack->rc_ack_required = 0;
16278 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
16279 }
16280 thflags = tcp_get_flags(th);
16281 if ((rack->rc_always_pace == 1) &&
16282 (rack->rc_ack_can_sendout_data == 0) &&
16283 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16284 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) {
16285 /*
16286 * Ok conditions are right for queuing the packets
16287 * but we do have to check the flags in the inp, it
16288 * could be, if a sack is present, we want to be awoken and
16289 * so should process the packets.
16290 */
16291 time_remaining = rack->r_ctl.rc_last_output_to - us_cts;
16292 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) {
16293 no_output = 1;
16294 } else {
16295 /*
16296 * If there is no options, or just a
16297 * timestamp option, we will want to queue
16298 * the packets. This is the same that LRO does
16299 * and will need to change with accurate ECN.
16300 */
16301 uint32_t *ts_ptr;
16302 int optlen;
16303
16304 optlen = (th->th_off << 2) - sizeof(struct tcphdr);
16305 ts_ptr = (uint32_t *)(th + 1);
16306 if ((optlen == 0) ||
16307 ((optlen == TCPOLEN_TSTAMP_APPA) &&
16308 (*ts_ptr == TCP_LRO_TS_OPTION)))
16309 no_output = 1;
16310 }
16311 if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) {
16312 /*
16313 * It is unrealistic to think we can pace in less than
16314 * the minimum granularity of the pacer (def:250usec). So
16315 * if we have less than that time remaining we should go
16316 * ahead and allow output to be "early". We will attempt to
16317 * make up for it in any pacing time we try to apply on
16318 * the outbound packet.
16319 */
16320 no_output = 0;
16321 }
16322 }
16323 /*
16324 * If there is a RST or FIN lets dump out the bw
16325 * with a FIN the connection may go on but we
16326 * may not.
16327 */
16328 if ((thflags & TH_FIN) || (thflags & TH_RST))
16329 rack_log_pacing_delay_calc(rack,
16330 rack->r_ctl.gp_bw,
16331 0,
16332 0,
16333 rack_get_gp_est(rack), /* delRate */
16334 rack_get_lt_bw(rack), /* rttProp */
16335 20, __LINE__, NULL, 0);
16336 if (m->m_flags & M_ACKCMP) {
16337 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
16338 }
16339 cts = tcp_tv_to_usec(tv);
16340 ms_cts = tcp_tv_to_msec(tv);
16341 nsegs = m->m_pkthdr.lro_nsegs;
16342 counter_u64_add(rack_proc_non_comp_ack, 1);
16343 #ifdef TCP_ACCOUNTING
16344 sched_pin();
16345 if (thflags & TH_ACK)
16346 ts_val = get_cyclecount();
16347 #endif
16348 if ((m->m_flags & M_TSTMP) ||
16349 (m->m_flags & M_TSTMP_LRO)) {
16350 mbuf_tstmp2timespec(m, &ts);
16351 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
16352 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
16353 } else
16354 rack->r_ctl.act_rcv_time = *tv;
16355 kern_prefetch(rack, &prev_state);
16356 prev_state = 0;
16357 /*
16358 * Unscale the window into a 32-bit value. For the SYN_SENT state
16359 * the scale is zero.
16360 */
16361 tiwin = th->th_win << tp->snd_scale;
16362 #ifdef TCP_ACCOUNTING
16363 if (thflags & TH_ACK) {
16364 /*
16365 * We have a tradeoff here. We can either do what we are
16366 * doing i.e. pinning to this CPU and then doing the accounting
16367 * <or> we could do a critical enter, setup the rdtsc and cpu
16368 * as in below, and then validate we are on the same CPU on
16369 * exit. I have choosen to not do the critical enter since
16370 * that often will gain you a context switch, and instead lock
16371 * us (line above this if) to the same CPU with sched_pin(). This
16372 * means we may be context switched out for a higher priority
16373 * interupt but we won't be moved to another CPU.
16374 *
16375 * If this occurs (which it won't very often since we most likely
16376 * are running this code in interupt context and only a higher
16377 * priority will bump us ... clock?) we will falsely add in
16378 * to the time the interupt processing time plus the ack processing
16379 * time. This is ok since its a rare event.
16380 */
16381 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
16382 ctf_fixed_maxseg(tp));
16383 }
16384 #endif
16385 /*
16386 * Parse options on any incoming segment.
16387 */
16388 memset(&to, 0, sizeof(to));
16389 tcp_dooptions(&to, (u_char *)(th + 1),
16390 (th->th_off << 2) - sizeof(struct tcphdr),
16391 (thflags & TH_SYN) ? TO_SYN : 0);
16392 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
16393 __func__));
16394 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
16395 __func__));
16396 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) {
16397 /*
16398 * We don't look at sack's from the
16399 * peer because the MSS is too small which
16400 * can subject us to an attack.
16401 */
16402 to.to_flags &= ~TOF_SACK;
16403 }
16404 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
16405 (tp->t_flags & TF_GPUTINPROG)) {
16406 /*
16407 * We have a goodput in progress
16408 * and we have entered a late state.
16409 * Do we have enough data in the sb
16410 * to handle the GPUT request?
16411 */
16412 uint32_t bytes;
16413
16414 bytes = tp->gput_ack - tp->gput_seq;
16415 if (SEQ_GT(tp->gput_seq, tp->snd_una))
16416 bytes += tp->gput_seq - tp->snd_una;
16417 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
16418 /*
16419 * There are not enough bytes in the socket
16420 * buffer that have been sent to cover this
16421 * measurement. Cancel it.
16422 */
16423 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
16424 rack->r_ctl.rc_gp_srtt /*flex1*/,
16425 tp->gput_seq,
16426 0, 0, 18, __LINE__, NULL, 0);
16427 tp->t_flags &= ~TF_GPUTINPROG;
16428 }
16429 }
16430 if (tcp_bblogging_on(rack->rc_tp)) {
16431 union tcp_log_stackspecific log;
16432 struct timeval ltv;
16433 #ifdef TCP_REQUEST_TRK
16434 struct tcp_sendfile_track *tcp_req;
16435
16436 if (SEQ_GT(th->th_ack, tp->snd_una)) {
16437 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1));
16438 } else {
16439 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack);
16440 }
16441 #endif
16442 memset(&log, 0, sizeof(log));
16443 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
16444 if (rack->rack_no_prr == 0)
16445 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16446 else
16447 log.u_bbr.flex1 = 0;
16448 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
16449 log.u_bbr.use_lt_bw <<= 1;
16450 log.u_bbr.use_lt_bw |= rack->r_might_revert;
16451 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
16452 log.u_bbr.bbr_state = rack->rc_free_cnt;
16453 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16454 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
16455 log.u_bbr.flex3 = m->m_flags;
16456 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
16457 log.u_bbr.lost = thflags;
16458 log.u_bbr.pacing_gain = 0x1;
16459 #ifdef TCP_ACCOUNTING
16460 log.u_bbr.cwnd_gain = ack_val_set;
16461 #endif
16462 log.u_bbr.flex7 = 2;
16463 if (m->m_flags & M_TSTMP) {
16464 /* Record the hardware timestamp if present */
16465 mbuf_tstmp2timespec(m, &ts);
16466 ltv.tv_sec = ts.tv_sec;
16467 ltv.tv_usec = ts.tv_nsec / 1000;
16468 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v);
16469 } else if (m->m_flags & M_TSTMP_LRO) {
16470 /* Record the LRO the arrival timestamp */
16471 mbuf_tstmp2timespec(m, &ts);
16472 ltv.tv_sec = ts.tv_sec;
16473 ltv.tv_usec = ts.tv_nsec / 1000;
16474 log.u_bbr.flex5 = tcp_tv_to_usec(<v);
16475 }
16476 log.u_bbr.timeStamp = tcp_get_usecs(<v);
16477 /* Log the rcv time */
16478 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
16479 #ifdef TCP_REQUEST_TRK
16480 log.u_bbr.applimited = tp->t_tcpreq_closed;
16481 log.u_bbr.applimited <<= 8;
16482 log.u_bbr.applimited |= tp->t_tcpreq_open;
16483 log.u_bbr.applimited <<= 8;
16484 log.u_bbr.applimited |= tp->t_tcpreq_req;
16485 if (tcp_req) {
16486 /* Copy out any client req info */
16487 /* seconds */
16488 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC);
16489 /* useconds */
16490 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC);
16491 log.u_bbr.rttProp = tcp_req->timestamp;
16492 log.u_bbr.cur_del_rate = tcp_req->start;
16493 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) {
16494 log.u_bbr.flex8 |= 1;
16495 } else {
16496 log.u_bbr.flex8 |= 2;
16497 log.u_bbr.bw_inuse = tcp_req->end;
16498 }
16499 log.u_bbr.flex6 = tcp_req->start_seq;
16500 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) {
16501 log.u_bbr.flex8 |= 4;
16502 log.u_bbr.epoch = tcp_req->end_seq;
16503 }
16504 }
16505 #endif
16506 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
16507 tlen, &log, true, <v);
16508 }
16509 /* Remove ack required flag if set, we have one */
16510 if (thflags & TH_ACK)
16511 rack->rc_ack_required = 0;
16512 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
16513 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
16514 way_out = 4;
16515 retval = 0;
16516 m_freem(m);
16517 goto done_with_input;
16518 }
16519 /*
16520 * If a segment with the ACK-bit set arrives in the SYN-SENT state
16521 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
16522 */
16523 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
16524 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
16525 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
16526 ctf_do_dropwithreset(m, tp, th, tlen);
16527 #ifdef TCP_ACCOUNTING
16528 sched_unpin();
16529 #endif
16530 return (1);
16531 }
16532 /*
16533 * If timestamps were negotiated during SYN/ACK and a
16534 * segment without a timestamp is received, silently drop
16535 * the segment, unless it is a RST segment or missing timestamps are
16536 * tolerated.
16537 * See section 3.2 of RFC 7323.
16538 */
16539 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
16540 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
16541 way_out = 5;
16542 retval = 0;
16543 m_freem(m);
16544 goto done_with_input;
16545 }
16546 /*
16547 * Segment received on connection. Reset idle time and keep-alive
16548 * timer. XXX: This should be done after segment validation to
16549 * ignore broken/spoofed segs.
16550 */
16551 if (tp->t_idle_reduce &&
16552 (tp->snd_max == tp->snd_una) &&
16553 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
16554 counter_u64_add(rack_input_idle_reduces, 1);
16555 rack_cc_after_idle(rack, tp);
16556 }
16557 tp->t_rcvtime = ticks;
16558 #ifdef STATS
16559 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
16560 #endif
16561 if (tiwin > rack->r_ctl.rc_high_rwnd)
16562 rack->r_ctl.rc_high_rwnd = tiwin;
16563 /*
16564 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
16565 * this to occur after we've validated the segment.
16566 */
16567 if (tcp_ecn_input_segment(tp, thflags, tlen,
16568 tcp_packets_this_ack(tp, th->th_ack),
16569 iptos))
16570 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
16571 if (tp->t_flags & TF_ACKNOW)
16572 rack->r_wanted_output = 1;
16573 /*
16574 * If echoed timestamp is later than the current time, fall back to
16575 * non RFC1323 RTT calculation. Normalize timestamp if syncookies
16576 * were used when this connection was established.
16577 */
16578 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
16579 to.to_tsecr -= tp->ts_offset;
16580 if (TSTMP_GT(to.to_tsecr, ms_cts))
16581 to.to_tsecr = 0;
16582 }
16583 if ((rack->r_rcvpath_rtt_up == 1) &&
16584 (to.to_flags & TOF_TS) &&
16585 (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) {
16586 uint32_t rtt = 0;
16587
16588 /*
16589 * We are receiving only and thus not sending
16590 * data to do an RTT. We set a flag when we first
16591 * sent this TS to the peer. We now have it back
16592 * and have an RTT to share. We log it as a conf
16593 * 4, we are not so sure about it.. since we
16594 * may have lost an ack.
16595 */
16596 if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv))
16597 rtt = (cts - rack->r_ctl.last_time_of_arm_rcv);
16598 rack->r_rcvpath_rtt_up = 0;
16599 /* Submit and commit the timer */
16600 if (rtt > 0) {
16601 tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1);
16602 tcp_rack_xmit_timer_commit(rack, tp);
16603 }
16604 }
16605 /*
16606 * If its the first time in we need to take care of options and
16607 * verify we can do SACK for rack!
16608 */
16609 if (rack->r_state == 0) {
16610 /* Should be init'd by rack_init() */
16611 KASSERT(rack->rc_inp != NULL,
16612 ("%s: rack->rc_inp unexpectedly NULL", __func__));
16613 if (rack->rc_inp == NULL) {
16614 rack->rc_inp = inp;
16615 }
16616
16617 /*
16618 * Process options only when we get SYN/ACK back. The SYN
16619 * case for incoming connections is handled in tcp_syncache.
16620 * According to RFC1323 the window field in a SYN (i.e., a
16621 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
16622 * this is traditional behavior, may need to be cleaned up.
16623 */
16624 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
16625 /* Handle parallel SYN for ECN */
16626 tcp_ecn_input_parallel_syn(tp, thflags, iptos);
16627 if ((to.to_flags & TOF_SCALE) &&
16628 (tp->t_flags & TF_REQ_SCALE)) {
16629 tp->t_flags |= TF_RCVD_SCALE;
16630 tp->snd_scale = to.to_wscale;
16631 } else
16632 tp->t_flags &= ~TF_REQ_SCALE;
16633 /*
16634 * Initial send window. It will be updated with the
16635 * next incoming segment to the scaled value.
16636 */
16637 tp->snd_wnd = th->th_win;
16638 rack_validate_fo_sendwin_up(tp, rack);
16639 if ((to.to_flags & TOF_TS) &&
16640 (tp->t_flags & TF_REQ_TSTMP)) {
16641 tp->t_flags |= TF_RCVD_TSTMP;
16642 tp->ts_recent = to.to_tsval;
16643 tp->ts_recent_age = cts;
16644 } else
16645 tp->t_flags &= ~TF_REQ_TSTMP;
16646 if (to.to_flags & TOF_MSS) {
16647 tcp_mss(tp, to.to_mss);
16648 }
16649 if ((tp->t_flags & TF_SACK_PERMIT) &&
16650 (to.to_flags & TOF_SACKPERM) == 0)
16651 tp->t_flags &= ~TF_SACK_PERMIT;
16652 if (tp->t_flags & TF_FASTOPEN) {
16653 if (to.to_flags & TOF_FASTOPEN) {
16654 uint16_t mss;
16655
16656 if (to.to_flags & TOF_MSS)
16657 mss = to.to_mss;
16658 else
16659 if ((inp->inp_vflag & INP_IPV6) != 0)
16660 mss = TCP6_MSS;
16661 else
16662 mss = TCP_MSS;
16663 tcp_fastopen_update_cache(tp, mss,
16664 to.to_tfo_len, to.to_tfo_cookie);
16665 } else
16666 tcp_fastopen_disable_path(tp);
16667 }
16668 }
16669 /*
16670 * At this point we are at the initial call. Here we decide
16671 * if we are doing RACK or not. We do this by seeing if
16672 * TF_SACK_PERMIT is set and the sack-not-required is clear.
16673 * The code now does do dup-ack counting so if you don't
16674 * switch back you won't get rack & TLP, but you will still
16675 * get this stack.
16676 */
16677
16678 if ((rack_sack_not_required == 0) &&
16679 ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
16680 tcp_switch_back_to_default(tp);
16681 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen,
16682 tlen, iptos);
16683 #ifdef TCP_ACCOUNTING
16684 sched_unpin();
16685 #endif
16686 return (1);
16687 }
16688 tcp_set_hpts(tp);
16689 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
16690 }
16691 if (thflags & TH_FIN)
16692 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
16693 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
16694 if ((rack->rc_gp_dyn_mul) &&
16695 (rack->use_fixed_rate == 0) &&
16696 (rack->rc_always_pace)) {
16697 /* Check in on probertt */
16698 rack_check_probe_rtt(rack, cts);
16699 }
16700 rack_clear_rate_sample(rack);
16701 if ((rack->forced_ack) &&
16702 ((tcp_get_flags(th) & TH_RST) == 0)) {
16703 rack_handle_probe_response(rack, tiwin, us_cts);
16704 }
16705 /*
16706 * This is the one exception case where we set the rack state
16707 * always. All other times (timers etc) we must have a rack-state
16708 * set (so we assure we have done the checks above for SACK).
16709 */
16710 rack->r_ctl.rc_rcvtime = cts;
16711 if (rack->r_state != tp->t_state)
16712 rack_set_state(tp, rack);
16713 if (SEQ_GT(th->th_ack, tp->snd_una) &&
16714 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL)
16715 kern_prefetch(rsm, &prev_state);
16716 prev_state = rack->r_state;
16717 if ((thflags & TH_RST) &&
16718 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
16719 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
16720 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) {
16721 /* The connection will be killed by a reset check the tracepoint */
16722 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV);
16723 }
16724 retval = (*rack->r_substate) (m, th, so,
16725 tp, &to, drop_hdrlen,
16726 tlen, tiwin, thflags, nxt_pkt, iptos);
16727 if (retval == 0) {
16728 /*
16729 * If retval is 1 the tcb is unlocked and most likely the tp
16730 * is gone.
16731 */
16732 INP_WLOCK_ASSERT(inp);
16733 if ((rack->rc_gp_dyn_mul) &&
16734 (rack->rc_always_pace) &&
16735 (rack->use_fixed_rate == 0) &&
16736 rack->in_probe_rtt &&
16737 (rack->r_ctl.rc_time_probertt_starts == 0)) {
16738 /*
16739 * If we are going for target, lets recheck before
16740 * we output.
16741 */
16742 rack_check_probe_rtt(rack, cts);
16743 }
16744 if (rack->set_pacing_done_a_iw == 0) {
16745 /* How much has been acked? */
16746 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
16747 /* We have enough to set in the pacing segment size */
16748 rack->set_pacing_done_a_iw = 1;
16749 rack_set_pace_segments(tp, rack, __LINE__, NULL);
16750 }
16751 }
16752 tcp_rack_xmit_timer_commit(rack, tp);
16753 #ifdef TCP_ACCOUNTING
16754 /*
16755 * If we set the ack_val_se to what ack processing we are doing
16756 * we also want to track how many cycles we burned. Note
16757 * the bits after tcp_output we let be "free". This is because
16758 * we are also tracking the tcp_output times as well. Note the
16759 * use of 0xf here since we only have 11 counter (0 - 0xa) and
16760 * 0xf cannot be returned and is what we initialize it too to
16761 * indicate we are not doing the tabulations.
16762 */
16763 if (ack_val_set != 0xf) {
16764 uint64_t crtsc;
16765
16766 crtsc = get_cyclecount();
16767 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16768 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
16769 }
16770 }
16771 #endif
16772 if ((nxt_pkt == 0) && (no_output == 0)) {
16773 if ((rack->r_wanted_output != 0) ||
16774 (tp->t_flags & TF_ACKNOW) ||
16775 (rack->r_fast_output != 0)) {
16776
16777 do_output_now:
16778 if (tcp_output(tp) < 0) {
16779 #ifdef TCP_ACCOUNTING
16780 sched_unpin();
16781 #endif
16782 return (1);
16783 }
16784 did_out = 1;
16785 }
16786 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16787 rack_free_trim(rack);
16788 } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
16789 goto do_output_now;
16790 } else if ((no_output == 1) &&
16791 (nxt_pkt == 0) &&
16792 (tcp_in_hpts(rack->rc_tp) == 0)) {
16793 /*
16794 * We are not in hpts and we had a pacing timer up. Use
16795 * the remaining time (time_remaining) to restart the timer.
16796 */
16797 KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
16798 rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0);
16799 rack_free_trim(rack);
16800 }
16801 /* Clear the flag, it may have been cleared by output but we may not have */
16802 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
16803 tp->t_flags2 &= ~TF2_HPTS_CALLS;
16804 /*
16805 * The draft (v3) calls for us to use SEQ_GEQ, but that
16806 * causes issues when we are just going app limited. Lets
16807 * instead use SEQ_GT <or> where its equal but more data
16808 * is outstanding.
16809 *
16810 * Also make sure we are on the last ack of a series. We
16811 * have to have all the ack's processed in queue to know
16812 * if there is something left outstanding.
16813 */
16814 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
16815 (rack->rc_new_rnd_needed == 0) &&
16816 (nxt_pkt == 0)) {
16817 /*
16818 * We have crossed into a new round with
16819 * the new snd_unae.
16820 */
16821 rack_new_round_setup(tp, rack, tp->snd_una);
16822 }
16823 if ((nxt_pkt == 0) &&
16824 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
16825 (SEQ_GT(tp->snd_max, tp->snd_una) ||
16826 (tp->t_flags & TF_DELACK) ||
16827 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
16828 (tp->t_state <= TCPS_CLOSING)))) {
16829 /* We could not send (probably in the hpts but stopped the timer earlier)? */
16830 if ((tp->snd_max == tp->snd_una) &&
16831 ((tp->t_flags & TF_DELACK) == 0) &&
16832 (tcp_in_hpts(rack->rc_tp)) &&
16833 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
16834 /* keep alive not needed if we are hptsi output yet */
16835 ;
16836 } else {
16837 int late = 0;
16838 if (tcp_in_hpts(tp)) {
16839 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
16840 us_cts = tcp_get_usecs(NULL);
16841 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
16842 rack->r_early = 1;
16843 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
16844 } else
16845 late = 1;
16846 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
16847 }
16848 tcp_hpts_remove(tp);
16849 }
16850 if (late && (did_out == 0)) {
16851 /*
16852 * We are late in the sending
16853 * and we did not call the output
16854 * (this probably should not happen).
16855 */
16856 goto do_output_now;
16857 }
16858 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
16859 }
16860 way_out = 1;
16861 } else if (nxt_pkt == 0) {
16862 /* Do we have the correct timer running? */
16863 rack_timer_audit(tp, rack, &so->so_snd);
16864 way_out = 2;
16865 }
16866 done_with_input:
16867 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
16868 if (did_out)
16869 rack->r_wanted_output = 0;
16870 }
16871
16872 #ifdef TCP_ACCOUNTING
16873 sched_unpin();
16874 #endif
16875 return (retval);
16876 }
16877
16878 static void
rack_do_segment(struct tcpcb * tp,struct mbuf * m,struct tcphdr * th,int32_t drop_hdrlen,int32_t tlen,uint8_t iptos)16879 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
16880 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
16881 {
16882 struct timeval tv;
16883
16884 /* First lets see if we have old packets */
16885 if (!STAILQ_EMPTY(&tp->t_inqueue)) {
16886 if (ctf_do_queued_segments(tp, 1)) {
16887 m_freem(m);
16888 return;
16889 }
16890 }
16891 if (m->m_flags & M_TSTMP_LRO) {
16892 mbuf_tstmp2timeval(m, &tv);
16893 } else {
16894 /* Should not be should we kassert instead? */
16895 tcp_get_usecs(&tv);
16896 }
16897 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0,
16898 &tv) == 0) {
16899 INP_WUNLOCK(tptoinpcb(tp));
16900 }
16901 }
16902
16903 struct rack_sendmap *
tcp_rack_output(struct tcpcb * tp,struct tcp_rack * rack,uint32_t tsused)16904 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
16905 {
16906 struct rack_sendmap *rsm = NULL;
16907 int32_t idx;
16908 uint32_t srtt = 0, thresh = 0, ts_low = 0;
16909
16910 /* Return the next guy to be re-transmitted */
16911 if (tqhash_empty(rack->r_ctl.tqh)) {
16912 return (NULL);
16913 }
16914 if (tp->t_flags & TF_SENTFIN) {
16915 /* retran the end FIN? */
16916 return (NULL);
16917 }
16918 /* ok lets look at this one */
16919 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
16920 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) {
16921 return (rsm);
16922 }
16923 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
16924 goto check_it;
16925 }
16926 rsm = rack_find_lowest_rsm(rack);
16927 if (rsm == NULL) {
16928 return (NULL);
16929 }
16930 check_it:
16931 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
16932 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
16933 /*
16934 * No sack so we automatically do the 3 strikes and
16935 * retransmit (no rack timer would be started).
16936 */
16937 return (rsm);
16938 }
16939 if (rsm->r_flags & RACK_ACKED) {
16940 return (NULL);
16941 }
16942 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
16943 (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
16944 /* Its not yet ready */
16945 return (NULL);
16946 }
16947 srtt = rack_grab_rtt(tp, rack);
16948 idx = rsm->r_rtr_cnt - 1;
16949 ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
16950 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
16951 if ((tsused == ts_low) ||
16952 (TSTMP_LT(tsused, ts_low))) {
16953 /* No time since sending */
16954 return (NULL);
16955 }
16956 if ((tsused - ts_low) < thresh) {
16957 /* It has not been long enough yet */
16958 return (NULL);
16959 }
16960 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
16961 ((rsm->r_flags & RACK_SACK_PASSED))) {
16962 /*
16963 * We have passed the dup-ack threshold <or>
16964 * a SACK has indicated this is missing.
16965 * Note that if you are a declared attacker
16966 * it is only the dup-ack threshold that
16967 * will cause retransmits.
16968 */
16969 /* log retransmit reason */
16970 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
16971 rack->r_fast_output = 0;
16972 return (rsm);
16973 }
16974 return (NULL);
16975 }
16976
/*
 * Emit a BBR_LOG_HPTSI_CALC black-box log record describing how a
 * pacing delay was derived.  "method" identifies the calling
 * calculation path; when verbose logging is disabled only a small set
 * of always-interesting methods (2, 3, 7, 89, 14 and 20) is recorded.
 * Several single-bit rack state flags are packed one bit at a time
 * into use_lt_bw, cwnd_gain and bbr_state; the shift/or ordering below
 * is therefore significant and must match the log decoder.
 */
static void
rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
    int line, struct rack_sendmap *rsm, uint8_t quality)
{
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		if (rack_verbose_logging == 0) {
			/*
			 * We are not verbose screen out all but
			 * ones we always want.
			 */
			if ((method != 2) &&
			    (method != 3) &&
			    (method != 7) &&
			    (method != 89) &&
			    (method != 14) &&
			    (method != 20)) {
				return;
			}
		}
		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = pacing_delay;
		log.u_bbr.flex2 = len;
		log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
		log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
		log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
		log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
		/* Pack eight pacing-related state bits into use_lt_bw. */
		log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->r_late;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->r_early;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
		log.u_bbr.use_lt_bw <<= 1;
		log.u_bbr.use_lt_bw |= rack->gp_ready;
		log.u_bbr.pkt_epoch = line;
		log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
		log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
		log.u_bbr.bw_inuse = bw_est;
		log.u_bbr.delRate = bw;
		/* Only report a current b/w once a goodput estimate exists. */
		if (rack->r_ctl.gp_bw == 0)
			log.u_bbr.cur_del_rate = 0;
		else
			log.u_bbr.cur_del_rate = rack_get_bw(rack);
		log.u_bbr.rttProp = len_time;
		log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
		log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
		log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
		if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
			/* We are in slow start */
			log.u_bbr.flex7 = 1;
		} else {
			/* we are on congestion avoidance */
			log.u_bbr.flex7 = 0;
		}
		log.u_bbr.flex8 = method;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		/* Pack six more state bits into cwnd_gain. */
		log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->use_fixed_rate;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_always_pace;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->gp_ready;
		log.u_bbr.bbr_substate = quality;
		log.u_bbr.bbr_state = rack->dgp_on;
		log.u_bbr.bbr_state <<= 1;
		log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
		log.u_bbr.bbr_state <<= 2;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_HPTSI_CALC, 0,
		    0, &log, false, &tv);
	}
}
17069
17070 static uint32_t
rack_get_pacing_len(struct tcp_rack * rack,uint64_t bw,uint32_t mss)17071 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
17072 {
17073 uint32_t new_tso, user_max, pace_one;
17074
17075 user_max = rack->rc_user_set_max_segs * mss;
17076 if (rack->rc_force_max_seg) {
17077 return (user_max);
17078 }
17079 if (rack->use_fixed_rate &&
17080 ((rack->r_ctl.crte == NULL) ||
17081 (bw != rack->r_ctl.crte->rate))) {
17082 /* Use the user mss since we are not exactly matched */
17083 return (user_max);
17084 }
17085 if (rack_pace_one_seg ||
17086 (rack->r_ctl.rc_user_set_min_segs == 1))
17087 pace_one = 1;
17088 else
17089 pace_one = 0;
17090
17091 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss,
17092 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
17093 if (new_tso > user_max)
17094 new_tso = user_max;
17095 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) {
17096 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso)
17097 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss;
17098 }
17099 if (rack->r_ctl.rc_user_set_min_segs &&
17100 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso))
17101 new_tso = rack->r_ctl.rc_user_set_min_segs * mss;
17102 return (new_tso);
17103 }
17104
17105 static uint64_t
rack_arrive_at_discounted_rate(struct tcp_rack * rack,uint64_t window_input,uint32_t * rate_set,uint32_t * gain_b)17106 rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b)
17107 {
17108 uint64_t reduced_win;
17109 uint32_t gain;
17110
17111 if (window_input < rc_init_window(rack)) {
17112 /*
17113 * The cwnd is collapsed to
17114 * nearly zero, maybe because of a time-out?
17115 * Lets drop back to the lt-bw.
17116 */
17117 reduced_win = rack_get_lt_bw(rack);
17118 /* Set the flag so the caller knows its a rate and not a reduced window */
17119 *rate_set = 1;
17120 gain = 100;
17121 } else if (IN_RECOVERY(rack->rc_tp->t_flags)) {
17122 /*
17123 * If we are in recover our cwnd needs to be less for
17124 * our pacing consideration.
17125 */
17126 if (rack->rack_hibeta == 0) {
17127 reduced_win = window_input / 2;
17128 gain = 50;
17129 } else {
17130 reduced_win = window_input * rack->r_ctl.saved_hibeta;
17131 reduced_win /= 100;
17132 gain = rack->r_ctl.saved_hibeta;
17133 }
17134 } else {
17135 /*
17136 * Apply Timely factor to increase/decrease the
17137 * amount we are pacing at.
17138 */
17139 gain = rack_get_output_gain(rack, NULL);
17140 if (gain > rack_gain_p5_ub) {
17141 gain = rack_gain_p5_ub;
17142 }
17143 reduced_win = window_input * gain;
17144 reduced_win /= 100;
17145 }
17146 if (gain_b != NULL)
17147 *gain_b = gain;
17148 /*
17149 * What is being returned here is a trimmed down
17150 * window values in all cases where rate_set is left
17151 * at 0. In one case we actually return the rate (lt_bw).
17152 * the "reduced_win" is returned as a slimmed down cwnd that
17153 * is then calculated by the caller into a rate when rate_set
17154 * is 0.
17155 */
17156 return (reduced_win);
17157 }
17158
/*
 * "Fill the cwnd" pacing: given the caller's computed pacing_delay for
 * "len" bytes, possibly return a shorter delay corresponding to a rate
 * fast enough to fill the congestion window within one RTT.  On the
 * fill path *rate_wanted is updated with the chosen bandwidth, and
 * *capped (when non-NULL) is set if a hardware rate table limited it.
 * Sets rack->r_via_fill_cw when the fill rate is actually adopted.
 * Returns the pacing delay (usecs) to use.
 */
static int32_t
pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
{
	uint64_t lentim, fill_bw;

	rack->r_via_fill_cw = 0;
	/* Flight already exceeds the cwnd we would fill - nothing to do. */
	if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
		return (pacing_delay);
	/* Receiver window, not the cwnd, is the limiter. */
	if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
		return (pacing_delay);
	/* No RTT sample yet, we cannot form a rate. */
	if (rack->r_ctl.rc_last_us_rtt == 0)
		return (pacing_delay);
	if (rack->rc_pace_fill_if_rttin_range &&
	    (rack->r_ctl.rc_last_us_rtt >=
	     (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
		/* The rtt is huge, N * smallest, lets not fill */
		return (pacing_delay);
	}
	/* Already at (or above) the configured fill-cw rate cap. */
	if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
		return (pacing_delay);
	/*
	 * first lets calculate the b/w based on the last us-rtt
	 * and the smallest send window.
	 */
	fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
	if (rack->rc_fillcw_apply_discount) {
		uint32_t rate_set = 0;

		/* Discount the window; may come back as an lt-bw rate. */
		fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL);
		if (rate_set) {
			/* fill_bw already holds a rate, skip the conversion. */
			goto at_lt_bw;
		}
	}
	/* Take the rwnd if its smaller */
	if (fill_bw > rack->rc_tp->snd_wnd)
		fill_bw = rack->rc_tp->snd_wnd;
	/* Now lets make it into a b/w */
	fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
	fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
	/* Adjust to any cap */
	if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap)
		fill_bw = rack->r_ctl.fillcw_cap;

at_lt_bw:
	if (rack_bw_multipler > 0) {
		/*
		 * We want to limit fill-cw to the some multiplier
		 * of the max(lt_bw, gp_est). The normal default
		 * is 0 for off, so a sysctl has enabled it.
		 */
		uint64_t lt_bw, gp, rate;

		gp = rack_get_gp_est(rack);
		lt_bw = rack_get_lt_bw(rack);
		if (lt_bw > gp)
			rate = lt_bw;
		else
			rate = gp;
		rate *= rack_bw_multipler;
		rate /= 100;
		if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
			union tcp_log_stackspecific log;
			struct timeval tv;

			memset(&log, 0, sizeof(log));
			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
			log.u_bbr.flex1 = rack_bw_multipler;
			log.u_bbr.flex2 = len;
			log.u_bbr.cur_del_rate = gp;
			log.u_bbr.delRate = lt_bw;
			log.u_bbr.bw_inuse = rate;
			log.u_bbr.rttProp = fill_bw;
			log.u_bbr.flex8 = 44;
			tcp_log_event(rack->rc_tp, NULL, NULL, NULL,
			    BBR_LOG_CWND, 0,
			    0, &log, false, NULL,
			    __func__, __LINE__, &tv);
		}
		if (fill_bw > rate)
			fill_bw = rate;
	}
	/* We are below the min b/w */
	if (non_paced)
		*rate_wanted = fill_bw;
	if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
		return (pacing_delay);
	rack->r_via_fill_cw = 1;
	if (rack->r_rack_hw_rate_caps &&
	    (rack->r_ctl.crte != NULL)) {
		uint64_t high_rate;

		high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
		if (fill_bw > high_rate) {
			/* We are capping bw at the highest rate table entry */
			if (*rate_wanted > high_rate) {
				/* The original rate was also capped */
				rack->r_via_fill_cw = 0;
			}
			rack_log_hdwr_pacing(rack,
			    fill_bw, high_rate, __LINE__,
			    0, 3);
			fill_bw = high_rate;
			if (capped)
				*capped = 1;
		}
	} else if ((rack->r_ctl.crte == NULL) &&
	    (rack->rack_hdrw_pacing == 0) &&
	    (rack->rack_hdw_pace_ena) &&
	    rack->r_rack_hw_rate_caps &&
	    (rack->rack_attempt_hdwr_pace == 0) &&
	    (rack->rc_inp->inp_route.ro_nh != NULL) &&
	    (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
		/*
		 * Ok we may have a first attempt that is greater than our top rate
		 * lets check.
		 */
		uint64_t high_rate;

		high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
		if (high_rate) {
			if (fill_bw > high_rate) {
				fill_bw = high_rate;
				if (capped)
					*capped = 1;
			}
		}
	}
	/* Finally respect any hybrid per-connection rate cap. */
	if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
		rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
		    fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
		fill_bw = rack->r_ctl.bw_rate_cap;
	}
	/*
	 * Ok fill_bw holds our mythical b/w to fill the cwnd
	 * in an rtt (unless it was capped), what does that
	 * time wise equate too?
	 */
	lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
	lentim /= fill_bw;
	*rate_wanted = fill_bw;
	if (non_paced || (lentim < pacing_delay)) {
		rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw,
		    0, lentim, 12, __LINE__, NULL, 0);
		return ((int32_t)lentim);
	} else
		return (pacing_delay);
}
17306
17307 static int32_t
rack_get_pacing_delay(struct tcp_rack * rack,struct tcpcb * tp,uint32_t len,struct rack_sendmap * rsm,uint32_t segsiz,int line)17308 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
17309 {
17310 uint64_t srtt;
17311 int32_t pacing_delay = 0;
17312 int can_start_hw_pacing = 1;
17313 int err;
17314 int pace_one;
17315
17316 if (rack_pace_one_seg ||
17317 (rack->r_ctl.rc_user_set_min_segs == 1))
17318 pace_one = 1;
17319 else
17320 pace_one = 0;
17321 if (rack->rc_always_pace == 0) {
17322 /*
17323 * We use the most optimistic possible cwnd/srtt for
17324 * sending calculations. This will make our
17325 * calculation anticipate getting more through
17326 * quicker then possible. But thats ok we don't want
17327 * the peer to have a gap in data sending.
17328 */
17329 uint64_t cwnd, tr_perms = 0;
17330 int32_t reduce;
17331
17332 old_method:
17333 /*
17334 * We keep no precise pacing with the old method
17335 * instead we use the pacer to mitigate bursts.
17336 */
17337 if (rack->r_ctl.rc_rack_min_rtt)
17338 srtt = rack->r_ctl.rc_rack_min_rtt;
17339 else
17340 srtt = max(tp->t_srtt, 1);
17341 if (rack->r_ctl.rc_rack_largest_cwnd)
17342 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
17343 else
17344 cwnd = rack->r_ctl.cwnd_to_use;
17345 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
17346 tr_perms = (cwnd * 1000) / srtt;
17347 if (tr_perms == 0) {
17348 tr_perms = ctf_fixed_maxseg(tp);
17349 }
17350 /*
17351 * Calculate how long this will take to drain, if
17352 * the calculation comes out to zero, thats ok we
17353 * will use send_a_lot to possibly spin around for
17354 * more increasing tot_len_this_send to the point
17355 * that its going to require a pace, or we hit the
17356 * cwnd. Which in that case we are just waiting for
17357 * a ACK.
17358 */
17359 pacing_delay = len / tr_perms;
17360 /* Now do we reduce the time so we don't run dry? */
17361 if (pacing_delay && rack_pacing_delay_reduction) {
17362 reduce = (pacing_delay / rack_pacing_delay_reduction);
17363 if (reduce < pacing_delay) {
17364 pacing_delay -= reduce;
17365 } else
17366 pacing_delay = 0;
17367 } else
17368 reduce = 0;
17369 pacing_delay *= HPTS_USEC_IN_MSEC;
17370 if (rack->rc_pace_to_cwnd) {
17371 uint64_t rate_wanted = 0;
17372
17373 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1);
17374 rack->rc_ack_can_sendout_data = 1;
17375 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
17376 } else
17377 rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
17378 /*******************************************************/
17379 /* RRS: We insert non-paced call to stats here for len */
17380 /*******************************************************/
17381 } else {
17382 uint64_t bw_est, res, lentim, rate_wanted;
17383 uint32_t segs, oh;
17384 int capped = 0;
17385 int prev_fill;
17386
17387 if ((rack->r_rr_config == 1) && rsm) {
17388 return (rack->r_ctl.rc_min_to);
17389 }
17390 if (rack->use_fixed_rate) {
17391 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
17392 } else if ((rack->r_ctl.init_rate == 0) &&
17393 (rack->r_ctl.gp_bw == 0)) {
17394 /* no way to yet do an estimate */
17395 bw_est = rate_wanted = 0;
17396 } else if (rack->dgp_on) {
17397 bw_est = rack_get_bw(rack);
17398 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
17399 } else {
17400 uint32_t gain, rate_set = 0;
17401
17402 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
17403 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain);
17404 if (rate_set == 0) {
17405 if (rate_wanted > rack->rc_tp->snd_wnd)
17406 rate_wanted = rack->rc_tp->snd_wnd;
17407 /* Now lets make it into a b/w */
17408 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC;
17409 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
17410 }
17411 bw_est = rate_wanted;
17412 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd,
17413 rack->r_ctl.cwnd_to_use,
17414 rate_wanted, bw_est,
17415 rack->r_ctl.rc_last_us_rtt,
17416 88, __LINE__, NULL, gain);
17417 }
17418 if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
17419 (rack->use_fixed_rate == 0)) {
17420 /*
17421 * No way yet to make a b/w estimate or
17422 * our raise is set incorrectly.
17423 */
17424 goto old_method;
17425 }
17426 rack_rate_cap_bw(rack, &rate_wanted, &capped);
17427 /* We need to account for all the overheads */
17428 segs = (len + segsiz - 1) / segsiz;
17429 /*
17430 * We need the diff between 1514 bytes (e-mtu with e-hdr)
17431 * and how much data we put in each packet. Yes this
17432 * means we may be off if we are larger than 1500 bytes
17433 * or smaller. But this just makes us more conservative.
17434 */
17435
17436 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr);
17437 if (rack->r_is_v6) {
17438 #ifdef INET6
17439 oh += sizeof(struct ip6_hdr);
17440 #endif
17441 } else {
17442 #ifdef INET
17443 oh += sizeof(struct ip);
17444 #endif
17445 }
17446 /* We add a fixed 14 for the ethernet header */
17447 oh += 14;
17448 segs *= oh;
17449 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
17450 res = lentim / rate_wanted;
17451 pacing_delay = (uint32_t)res;
17452 if (rack_hw_rate_min &&
17453 (rate_wanted < rack_hw_rate_min)) {
17454 can_start_hw_pacing = 0;
17455 if (rack->r_ctl.crte) {
17456 /*
17457 * Ok we need to release it, we
17458 * have fallen too low.
17459 */
17460 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17461 rack->r_ctl.crte = NULL;
17462 rack->rack_attempt_hdwr_pace = 0;
17463 rack->rack_hdrw_pacing = 0;
17464 }
17465 }
17466 if (rack->r_ctl.crte &&
17467 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17468 /*
17469 * We want more than the hardware can give us,
17470 * don't start any hw pacing.
17471 */
17472 can_start_hw_pacing = 0;
17473 if (rack->r_rack_hw_rate_caps == 0) {
17474 /*
17475 * Ok we need to release it, we
17476 * want more than the card can give us and
17477 * no rate cap is in place. Set it up so
17478 * when we want less we can retry.
17479 */
17480 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17481 rack->r_ctl.crte = NULL;
17482 rack->rack_attempt_hdwr_pace = 0;
17483 rack->rack_hdrw_pacing = 0;
17484 }
17485 }
17486 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) {
17487 /*
17488 * We lost our rate somehow, this can happen
17489 * if the interface changed underneath us.
17490 */
17491 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17492 rack->r_ctl.crte = NULL;
17493 /* Lets re-allow attempting to setup pacing */
17494 rack->rack_hdrw_pacing = 0;
17495 rack->rack_attempt_hdwr_pace = 0;
17496 rack_log_hdwr_pacing(rack,
17497 rate_wanted, bw_est, __LINE__,
17498 0, 6);
17499 }
17500 prev_fill = rack->r_via_fill_cw;
17501 if ((rack->rc_pace_to_cwnd) &&
17502 (capped == 0) &&
17503 (rack->dgp_on == 1) &&
17504 (rack->use_fixed_rate == 0) &&
17505 (rack->in_probe_rtt == 0) &&
17506 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
17507 /*
17508 * We want to pace at our rate *or* faster to
17509 * fill the cwnd to the max if its not full.
17510 */
17511 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0);
17512 /* Re-check to make sure we are not exceeding our max b/w */
17513 if ((rack->r_ctl.crte != NULL) &&
17514 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17515 /*
17516 * We want more than the hardware can give us,
17517 * don't start any hw pacing.
17518 */
17519 can_start_hw_pacing = 0;
17520 if (rack->r_rack_hw_rate_caps == 0) {
17521 /*
17522 * Ok we need to release it, we
17523 * want more than the card can give us and
17524 * no rate cap is in place. Set it up so
17525 * when we want less we can retry.
17526 */
17527 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17528 rack->r_ctl.crte = NULL;
17529 rack->rack_attempt_hdwr_pace = 0;
17530 rack->rack_hdrw_pacing = 0;
17531 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
17532 }
17533 }
17534 }
17535 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
17536 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
17537 if ((rack->rack_hdw_pace_ena) &&
17538 (can_start_hw_pacing > 0) &&
17539 (rack->rack_hdrw_pacing == 0) &&
17540 (rack->rack_attempt_hdwr_pace == 0)) {
17541 /*
17542 * Lets attempt to turn on hardware pacing
17543 * if we can.
17544 */
17545 rack->rack_attempt_hdwr_pace = 1;
17546 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
17547 rack->rc_inp->inp_route.ro_nh->nh_ifp,
17548 rate_wanted,
17549 RS_PACING_GEQ,
17550 &err, &rack->r_ctl.crte_prev_rate);
17551 if (rack->r_ctl.crte) {
17552 rack->rack_hdrw_pacing = 1;
17553 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz,
17554 pace_one, rack->r_ctl.crte,
17555 NULL, rack->r_ctl.pace_len_divisor);
17556 rack_log_hdwr_pacing(rack,
17557 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17558 err, 0);
17559 rack->r_ctl.last_hw_bw_req = rate_wanted;
17560 } else {
17561 counter_u64_add(rack_hw_pace_init_fail, 1);
17562 }
17563 } else if (rack->rack_hdrw_pacing &&
17564 (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
17565 /* Do we need to adjust our rate? */
17566 const struct tcp_hwrate_limit_table *nrte;
17567
17568 if (rack->r_up_only &&
17569 (rate_wanted < rack->r_ctl.crte->rate)) {
17570 /**
17571 * We have four possible states here
17572 * having to do with the previous time
17573 * and this time.
17574 * previous | this-time
17575 * A) 0 | 0 -- fill_cw not in the picture
17576 * B) 1 | 0 -- we were doing a fill-cw but now are not
17577 * C) 1 | 1 -- all rates from fill_cw
17578 * D) 0 | 1 -- we were doing non-fill and now we are filling
17579 *
17580 * For case A, C and D we don't allow a drop. But for
17581 * case B where we now our on our steady rate we do
17582 * allow a drop.
17583 *
17584 */
17585 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
17586 goto done_w_hdwr;
17587 }
17588 if ((rate_wanted > rack->r_ctl.crte->rate) ||
17589 (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
17590 if (rack_hw_rate_to_low &&
17591 (bw_est < rack_hw_rate_to_low)) {
17592 /*
17593 * The pacing rate is too low for hardware, but
17594 * do allow hardware pacing to be restarted.
17595 */
17596 rack_log_hdwr_pacing(rack,
17597 bw_est, rack->r_ctl.crte->rate, __LINE__,
17598 0, 5);
17599 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17600 rack->r_ctl.crte = NULL;
17601 rack->rack_attempt_hdwr_pace = 0;
17602 rack->rack_hdrw_pacing = 0;
17603 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17604 goto done_w_hdwr;
17605 }
17606 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
17607 rack->rc_tp,
17608 rack->rc_inp->inp_route.ro_nh->nh_ifp,
17609 rate_wanted,
17610 RS_PACING_GEQ,
17611 &err, &rack->r_ctl.crte_prev_rate);
17612 if (nrte == NULL) {
17613 /*
17614 * Lost the rate, lets drop hardware pacing
17615 * period.
17616 */
17617 rack->rack_hdrw_pacing = 0;
17618 rack->r_ctl.crte = NULL;
17619 rack_log_hdwr_pacing(rack,
17620 rate_wanted, 0, __LINE__,
17621 err, 1);
17622 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17623 counter_u64_add(rack_hw_pace_lost, 1);
17624 } else if (nrte != rack->r_ctl.crte) {
17625 rack->r_ctl.crte = nrte;
17626 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted,
17627 segsiz, pace_one, rack->r_ctl.crte,
17628 NULL, rack->r_ctl.pace_len_divisor);
17629 rack_log_hdwr_pacing(rack,
17630 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17631 err, 2);
17632 rack->r_ctl.last_hw_bw_req = rate_wanted;
17633 }
17634 } else {
17635 /* We just need to adjust the segment size */
17636 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17637 rack_log_hdwr_pacing(rack,
17638 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17639 0, 4);
17640 rack->r_ctl.last_hw_bw_req = rate_wanted;
17641 }
17642 }
17643 }
17644 done_w_hdwr:
17645 if (rack_limit_time_with_srtt &&
17646 (rack->use_fixed_rate == 0) &&
17647 (rack->rack_hdrw_pacing == 0)) {
17648 /*
17649 * Sanity check, we do not allow the pacing delay
17650 * to be longer than the SRTT of the path. If it is
17651 * a slow path, then adding a packet should increase
17652 * the RTT and compensate for this i.e. the srtt will
17653 * be greater so the allowed pacing time will be greater.
17654 *
17655 * Note this restriction is not for where a peak rate
17656 * is set, we are doing fixed pacing or hardware pacing.
17657 */
17658 if (rack->rc_tp->t_srtt)
17659 srtt = rack->rc_tp->t_srtt;
17660 else
17661 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */
17662 if (srtt < (uint64_t)pacing_delay) {
17663 rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
17664 pacing_delay = srtt;
17665 }
17666 }
17667 /*******************************************************************/
17668 /* RRS: We insert paced call to stats here for len and rate_wanted */
17669 /*******************************************************************/
17670 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
17671 }
17672 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
17673 /*
17674 * If this rate is seeing enobufs when it
17675 * goes to send then either the nic is out
17676 * of gas or we are mis-estimating the time
17677 * somehow and not letting the queue empty
17678 * completely. Lets add to the pacing time.
17679 */
17680 int hw_boost_delay;
17681
17682 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
17683 if (hw_boost_delay > rack_enobuf_hw_max)
17684 hw_boost_delay = rack_enobuf_hw_max;
17685 else if (hw_boost_delay < rack_enobuf_hw_min)
17686 hw_boost_delay = rack_enobuf_hw_min;
17687 pacing_delay += hw_boost_delay;
17688 }
17689 return (pacing_delay);
17690 }
17691
/*
 * Begin a goodput (GP) measurement window starting at startseq.
 *
 * Sets TF_GPUTINPROG and primes tp->gput_seq/gput_ack/gput_ts plus the
 * rack-side bookkeeping (rc_gp_output_ts, rc_gp_cumack_ts, etc.) so that
 * the ack path can later compute a bandwidth sample. The start/end points
 * are chosen from already-outstanding data (via the sendmap) when that
 * data is usable, otherwise from "the latest" send at startseq.
 *
 * No measurement is started before ESTABLISHED, and past FIN_WAIT_1 we
 * require the whole measurement's worth of data to already be in the
 * socket buffer (no more data will arrive there).
 */
static void
rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq startseq, uint32_t sb_offset)
{
	struct rack_sendmap *my_rsm = NULL;

	if (tp->t_state < TCPS_ESTABLISHED) {
		/*
		 * We don't start any measurements if we are
		 * not at least established.
		 */
		return;
	}
	if (tp->t_state >= TCPS_FIN_WAIT_1) {
		/*
		 * We will get no more data into the SB
		 * this means we need to have the data available
		 * before we start a measurement.
		 */

		if (sbavail(&tptosocket(tp)->so_snd) <
		    max(rc_init_window(rack),
			(MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
			/* Nope not enough data */
			return;
		}
	}
	/* Commit to a measurement and reset per-measurement state. */
	tp->t_flags |= TF_GPUTINPROG;
	rack->r_ctl.rc_gp_cumack_ts = 0;
	rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
	rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
	tp->gput_seq = startseq;
	rack->app_limited_needs_set = 0;
	if (rack->in_probe_rtt)
		rack->measure_saw_probe_rtt = 1;
	else if ((rack->measure_saw_probe_rtt) &&
		 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
		rack->measure_saw_probe_rtt = 0;
	if (rack->rc_gp_filled)
		tp->gput_ts = rack->r_ctl.last_cumack_advance;
	else {
		/* Special case initial measurement */
		struct timeval tv;

		tp->gput_ts = tcp_get_usecs(&tv);
		rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
	}
	/*
	 * We take a guess out into the future,
	 * if we have no measurement and no
	 * initial rate, we measure the first
	 * initial-windows worth of data to
	 * speed up getting some GP measurement and
	 * thus start pacing.
	 */
	if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
		rack->app_limited_needs_set = 1;
		tp->gput_ack = startseq + max(rc_init_window(rack),
					      (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
		rack_log_pacing_delay_calc(rack,
					   tp->gput_seq,
					   tp->gput_ack,
					   0,
					   tp->gput_ts,
					   (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
					   9,
					   __LINE__, NULL, 0);
		rack_tend_gp_marks(tp, rack);
		rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
		return;
	}
	if (sb_offset) {
		/*
		 * We are out somewhere in the sb
		 * can we use the already outstanding data?
		 */

		if (rack->r_ctl.rc_app_limited_cnt == 0) {
			/*
			 * Yes first one is good and in this case
			 * the tp->gput_ts is correctly set based on
			 * the last ack that arrived (no need to
			 * set things up when an ack comes in).
			 */
			my_rsm = tqhash_min(rack->r_ctl.tqh);
			if ((my_rsm == NULL) ||
			    (my_rsm->r_rtr_cnt != 1)) {
				/* retransmission? */
				goto use_latest;
			}
		} else {
			if (rack->r_ctl.rc_first_appl == NULL) {
				/*
				 * If rc_first_appl is NULL
				 * then the cnt should be 0.
				 * This is probably an error, maybe
				 * a KASSERT would be appropriate.
				 */
				goto use_latest;
			}
			/*
			 * If we have a marker pointer to the last one that is
			 * app limited we can use that, but we need to set
			 * things up so that when it gets ack'ed we record
			 * the ack time (if its not already acked).
			 */
			rack->app_limited_needs_set = 1;
			/*
			 * We want to get to the rsm that is either
			 * next with space i.e. over 1 MSS or the one
			 * after that (after the app-limited).
			 */
			my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl);
			if (my_rsm) {
				if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
					/* Have to use the next one */
					my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
				else {
					/* Use after the first MSS of it is acked */
					tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
					goto start_set;
				}
			}
			if ((my_rsm == NULL) ||
			    (my_rsm->r_rtr_cnt != 1)) {
				/*
				 * Either its a retransmit or
				 * the last is the app-limited one.
				 */
				goto use_latest;
			}
		}
		tp->gput_seq = my_rsm->r_start;
start_set:
		if (my_rsm->r_flags & RACK_ACKED) {
			/*
			 * This one has been acked use the arrival ack time
			 */
			struct rack_sendmap *nrsm;

			tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
			rack->app_limited_needs_set = 0;
			/*
			 * Ok in this path we need to use the r_end now
			 * since this guy is the starting ack.
			 */
			tp->gput_seq = my_rsm->r_end;
			/*
			 * We also need to adjust up the sendtime
			 * to the send of the next data after my_rsm.
			 */
			nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
			if (nrsm != NULL)
				my_rsm = nrsm;
			else {
				/*
				 * The next has not been sent, thats the
				 * case for using the latest.
				 */
				goto use_latest;
			}
		}
		/* Take the send time of the chosen rsm as the measurement origin. */
		rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
		tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
		rack->r_ctl.rc_gp_cumack_ts = 0;
		if ((rack->r_ctl.cleared_app_ack == 1) &&
		    (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
			/*
			 * We just cleared an application limited period
			 * so the next seq out needs to skip the first
			 * ack.
			 */
			rack->app_limited_needs_set = 1;
			rack->r_ctl.cleared_app_ack = 0;
		}
		rack_log_pacing_delay_calc(rack,
					   tp->gput_seq,
					   tp->gput_ack,
					   (uintptr_t)my_rsm,
					   tp->gput_ts,
					   (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
					   9,
					   __LINE__, my_rsm, 0);
		/* Now lets make sure all are marked as they should be */
		rack_tend_gp_marks(tp, rack);
		rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
		return;
	}

use_latest:
	/*
	 * We don't know how long we may have been
	 * idle or if this is the first-send. Lets
	 * setup the flag so we will trim off
	 * the first ack'd data so we get a true
	 * measurement.
	 */
	rack->app_limited_needs_set = 1;
	tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
	rack->r_ctl.rc_gp_cumack_ts = 0;
	/* Find this guy so we can pull the send time */
	my_rsm = tqhash_find(rack->r_ctl.tqh, startseq);
	if (my_rsm) {
		rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
		if (my_rsm->r_flags & RACK_ACKED) {
			/*
			 * Unlikely since its probably what was
			 * just transmitted (but I am paranoid).
			 */
			tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
			rack->app_limited_needs_set = 0;
		}
		if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
			/* This also is unlikely */
			tp->gput_seq = my_rsm->r_start;
		}
	} else {
		/*
		 * TSNH unless we have some send-map limit,
		 * and even at that it should not be hitting
		 * that limit (we should have stopped sending).
		 */
		struct timeval tv;

		microuptime(&tv);
		rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
	}
	rack_tend_gp_marks(tp, rack);
	rack_log_pacing_delay_calc(rack,
				   tp->gput_seq,
				   tp->gput_ack,
				   (uintptr_t)my_rsm,
				   tp->gput_ts,
				   (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
				   9, __LINE__, NULL, 0);
	rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
}
17929
17930 static inline uint32_t
rack_what_can_we_send(struct tcpcb * tp,struct tcp_rack * rack,uint32_t cwnd_to_use,uint32_t avail,int32_t sb_offset)17931 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use,
17932 uint32_t avail, int32_t sb_offset)
17933 {
17934 uint32_t len;
17935 uint32_t sendwin;
17936
17937 if (tp->snd_wnd > cwnd_to_use)
17938 sendwin = cwnd_to_use;
17939 else
17940 sendwin = tp->snd_wnd;
17941 if (ctf_outstanding(tp) >= tp->snd_wnd) {
17942 /* We never want to go over our peers rcv-window */
17943 len = 0;
17944 } else {
17945 uint32_t flight;
17946
17947 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
17948 if (flight >= sendwin) {
17949 /*
17950 * We have in flight what we are allowed by cwnd (if
17951 * it was rwnd blocking it would have hit above out
17952 * >= tp->snd_wnd).
17953 */
17954 return (0);
17955 }
17956 len = sendwin - flight;
17957 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
17958 /* We would send too much (beyond the rwnd) */
17959 len = tp->snd_wnd - ctf_outstanding(tp);
17960 }
17961 if ((len + sb_offset) > avail) {
17962 /*
17963 * We don't have that much in the SB, how much is
17964 * there?
17965 */
17966 len = avail - sb_offset;
17967 }
17968 }
17969 return (len);
17970 }
17971
17972 static void
rack_log_fsb(struct tcp_rack * rack,struct tcpcb * tp,struct socket * so,uint32_t flags,unsigned ipoptlen,int32_t orig_len,int32_t len,int error,int rsm_is_null,int optlen,int line,uint16_t mode)17973 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
17974 unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
17975 int rsm_is_null, int optlen, int line, uint16_t mode)
17976 {
17977 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
17978 union tcp_log_stackspecific log;
17979 struct timeval tv;
17980
17981 memset(&log, 0, sizeof(log));
17982 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
17983 log.u_bbr.flex1 = error;
17984 log.u_bbr.flex2 = flags;
17985 log.u_bbr.flex3 = rsm_is_null;
17986 log.u_bbr.flex4 = ipoptlen;
17987 log.u_bbr.flex5 = tp->rcv_numsacks;
17988 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
17989 log.u_bbr.flex7 = optlen;
17990 log.u_bbr.flex8 = rack->r_fsb_inited;
17991 log.u_bbr.applimited = rack->r_fast_output;
17992 log.u_bbr.bw_inuse = rack_get_bw(rack);
17993 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
17994 log.u_bbr.cwnd_gain = mode;
17995 log.u_bbr.pkts_out = orig_len;
17996 log.u_bbr.lt_epoch = len;
17997 log.u_bbr.delivered = line;
17998 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
17999 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18000 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
18001 len, &log, false, NULL, __func__, __LINE__, &tv);
18002 }
18003 }
18004
18005
/*
 * Core copy routine for the fast-send path: build a new mbuf chain that
 * references up to *plen bytes starting at the_off within the chain at
 * the_m, without consuming the source chain. Honors a TSO segment-count
 * budget (seglimit, of segsize-sized fragments) and stops early at a
 * KERN_TLS session boundary so TLS records from different sessions (or
 * handshake data) are never mixed in one send.
 *
 * On return *plen is updated to the number of bytes actually copied.
 * If fsb is non-NULL, the source-chain resume position (mbuf, offset,
 * and the mbuf's length/trailing-room at snapshot time) is saved so a
 * later call can detect sbcut() trims and continue where we left off.
 * Returns the new chain, or NULL (with nothing leaked) on allocation
 * failure.
 */
static struct mbuf *
rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
    struct rack_fast_send_blk *fsb,
    int32_t seglimit, int32_t segsize, int hw_tls)
{
#ifdef KERN_TLS
	struct ktls_session *tls, *ntls;
#ifdef INVARIANTS
	struct mbuf *start;
#endif
#endif
	struct mbuf *m, *n, **np, *smb;
	struct mbuf *top;
	int32_t off, soff;
	int32_t len = *plen;
	int32_t fragsize;
	int32_t len_cp = 0;
	uint32_t mlen, frags;

	soff = off = the_off;
	smb = m = the_m;
	np = &top;
	top = NULL;
#ifdef KERN_TLS
	/* Remember the TLS session (if any) of the first mbuf. */
	if (hw_tls && (m->m_flags & M_EXTPG))
		tls = m->m_epg_tls;
	else
		tls = NULL;
#ifdef INVARIANTS
	start = m;
#endif
#endif
	while (len > 0) {
		if (m == NULL) {
			/* Source chain exhausted; report the partial copy. */
			*plen = len_cp;
			break;
		}
#ifdef KERN_TLS
		if (hw_tls) {
			if (m->m_flags & M_EXTPG)
				ntls = m->m_epg_tls;
			else
				ntls = NULL;

			/*
			 * Avoid mixing TLS records with handshake
			 * data or TLS records from different
			 * sessions.
			 */
			if (tls != ntls) {
				MPASS(m != start);
				*plen = len_cp;
				break;
			}
		}
#endif
		mlen = min(len, m->m_len - off);
		if (seglimit) {
			/*
			 * For M_EXTPG mbufs, add 3 segments
			 * + 1 in case we are crossing page boundaries
			 * + 2 in case the TLS hdr/trailer are used
			 * It is cheaper to just add the segments
			 * than it is to take the cache miss to look
			 * at the mbuf ext_pgs state in detail.
			 */
			if (m->m_flags & M_EXTPG) {
				fragsize = min(segsize, PAGE_SIZE);
				frags = 3;
			} else {
				fragsize = segsize;
				frags = 0;
			}

			/* Break if we really can't fit anymore. */
			if ((frags + 1) >= seglimit) {
				*plen = len_cp;
				break;
			}

			/*
			 * Reduce size if you can't copy the whole
			 * mbuf. If we can't copy the whole mbuf, also
			 * adjust len so the loop will end after this
			 * mbuf.
			 */
			if ((frags + howmany(mlen, fragsize)) >= seglimit) {
				mlen = (seglimit - frags - 1) * fragsize;
				len = mlen;
				*plen = len_cp + len;
			}
			frags += howmany(mlen, fragsize);
			if (frags == 0)
				frags++;
			seglimit -= frags;
			KASSERT(seglimit > 0,
			    ("%s: seglimit went too low", __func__));
		}
		n = m_get(M_NOWAIT, m->m_type);
		*np = n;
		if (n == NULL)
			goto nospace;
		n->m_len = mlen;
		soff += mlen;
		len_cp += n->m_len;
		if (m->m_flags & (M_EXT | M_EXTPG)) {
			/* Share the external storage instead of copying. */
			n->m_data = m->m_data + off;
			mb_dupcl(n, m);
		} else {
			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
			    (u_int)n->m_len);
		}
		len -= n->m_len;
		off = 0;
		m = m->m_next;
		np = &n->m_next;
		if (len || (soff == smb->m_len)) {
			/*
			 * We have more so we move forward or
			 * we have consumed the entire mbuf and
			 * len has fell to 0.
			 */
			soff = 0;
			smb = m;
		}

	}
	if (fsb != NULL) {
		/* Save the resume point for the next fast-send. */
		fsb->m = smb;
		fsb->off = soff;
		if (smb) {
			/*
			 * Save off the size of the mbuf. We do
			 * this so that we can recognize when it
			 * has been trimmed by sbcut() as acks
			 * come in.
			 */
			fsb->o_m_len = smb->m_len;
			fsb->o_t_len = M_TRAILINGROOM(smb);
		} else {
			/*
			 * This is the case where the next mbuf went to NULL. This
			 * means with this copy we have sent everything in the sb.
			 * In theory we could clear the fast_output flag, but lets
			 * not since its possible that we could get more added
			 * and acks that call the extend function which would let
			 * us send more.
			 */
			fsb->o_m_len = 0;
			fsb->o_t_len = 0;
		}
	}
	return (top);
nospace:
	if (top)
		m_freem(top);
	return (NULL);

}
18165
18166 /*
18167 * This is a copy of m_copym(), taking the TSO segment size/limit
18168 * constraints into account, and advancing the sndptr as it goes.
18169 */
18170 static struct mbuf *
rack_fo_m_copym(struct tcp_rack * rack,int32_t * plen,int32_t seglimit,int32_t segsize,struct mbuf ** s_mb,int * s_soff)18171 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
18172 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
18173 {
18174 struct mbuf *m, *n;
18175 int32_t soff;
18176
18177 m = rack->r_ctl.fsb.m;
18178 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) {
18179 /*
18180 * The trailing space changed, mbufs can grow
18181 * at the tail but they can't shrink from
18182 * it, KASSERT that. Adjust the orig_m_len to
18183 * compensate for this change.
18184 */
18185 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)),
18186 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
18187 m,
18188 rack,
18189 (intmax_t)M_TRAILINGROOM(m),
18190 rack->r_ctl.fsb.o_t_len,
18191 rack->r_ctl.fsb.o_m_len,
18192 m->m_len));
18193 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m));
18194 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m);
18195 }
18196 if (m->m_len < rack->r_ctl.fsb.o_m_len) {
18197 /*
18198 * Mbuf shrank, trimmed off the top by an ack, our
18199 * offset changes.
18200 */
18201 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)),
18202 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n",
18203 m, m->m_len,
18204 rack, rack->r_ctl.fsb.o_m_len,
18205 rack->r_ctl.fsb.off));
18206
18207 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len))
18208 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len);
18209 else
18210 rack->r_ctl.fsb.off = 0;
18211 rack->r_ctl.fsb.o_m_len = m->m_len;
18212 #ifdef INVARIANTS
18213 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) {
18214 panic("rack:%p m:%p m_len grew outside of t_space compensation",
18215 rack, m);
18216 #endif
18217 }
18218 soff = rack->r_ctl.fsb.off;
18219 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
18220 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
18221 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
18222 __FUNCTION__,
18223 rack, *plen, m, m->m_len));
18224 /* Save off the right location before we copy and advance */
18225 *s_soff = soff;
18226 *s_mb = rack->r_ctl.fsb.m;
18227 n = rack_fo_base_copym(m, soff, plen,
18228 &rack->r_ctl.fsb,
18229 seglimit, segsize, rack->r_ctl.fsb.hw_tls);
18230 return (n);
18231 }
18232
18233 /* Log the buffer level */
18234 static void
rack_log_queue_level(struct tcpcb * tp,struct tcp_rack * rack,int len,struct timeval * tv,uint32_t cts)18235 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
18236 int len, struct timeval *tv,
18237 uint32_t cts)
18238 {
18239 uint32_t p_rate = 0, p_queue = 0, err = 0;
18240 union tcp_log_stackspecific log;
18241
18242 #ifdef RATELIMIT
18243 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18244 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18245 #endif
18246 memset(&log, 0, sizeof(log));
18247 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18248 log.u_bbr.flex1 = p_rate;
18249 log.u_bbr.flex2 = p_queue;
18250 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18251 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18252 log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18253 log.u_bbr.flex7 = 99;
18254 log.u_bbr.flex8 = 0;
18255 log.u_bbr.pkts_out = err;
18256 log.u_bbr.delRate = rack->r_ctl.crte->rate;
18257 log.u_bbr.timeStamp = cts;
18258 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18259 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18260 len, &log, false, NULL, __func__, __LINE__, tv);
18261
18262 }
18263
/*
 * Estimate (in microseconds) how long the hardware pacing queue needs
 * to drain before another send should go out. Queries the inpcb's
 * ratelimit queue level and rate; if the queue is non-empty, models a
 * full pace burst (rc_pace_max_segs, in ethernet-sized segments) being
 * transmitted at the queried rate. Returns 0 when the queue is empty,
 * a query fails, or RATELIMIT is not compiled in. The result is also
 * blackbox-logged as BBR_LOG_HDWR_PACE when logging is on.
 *
 * NOTE(review): callers are expected to hold a valid r_ctl.crte — it
 * is dereferenced unconditionally in the logging path.
 */
static uint32_t
rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
    struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
{
	uint64_t lentime = 0;
#ifdef RATELIMIT
	uint32_t p_rate = 0, p_queue = 0, err;
	union tcp_log_stackspecific log;
	uint64_t bw;

	err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
	/* Failed or queue is zero */
	if (err || (p_queue == 0)) {
		lentime = 0;
		goto out;
	}
	err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
	if (err) {
		lentime = 0;
		goto out;
	}
	/*
	 * If we reach here we have some bytes in
	 * the queue. The number returned is a value
	 * between 0 and 0xffff where ffff is full
	 * and 0 is empty. So how best to make this into
	 * something usable?
	 *
	 * The "safer" way is lets take the b/w gotten
	 * from the query (which should be our b/w rate)
	 * and pretend that a full send (our rc_pace_max_segs)
	 * is outstanding. We factor it so its as if a full
	 * number of our MSS segment is terms of full
	 * ethernet segments are outstanding.
	 */
	bw = p_rate / 8;	/* bits/sec -> bytes/sec */
	if (bw) {
		/* usecs = (segments * wire_bytes_per_seg) / bytes_per_usec */
		lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
		lentime *= ETHERNET_SEGMENT_SIZE;
		lentime *= (uint64_t)HPTS_USEC_IN_SEC;
		lentime /= bw;
	} else {
		/* TSNH -- KASSERT? */
		lentime = 0;
	}
out:
	if (tcp_bblogging_on(tp)) {
		memset(&log, 0, sizeof(log));
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		log.u_bbr.flex1 = p_rate;
		log.u_bbr.flex2 = p_queue;
		log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
		log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
		log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
		log.u_bbr.flex7 = 99;
		log.u_bbr.flex8 = 0;
		log.u_bbr.pkts_out = err;
		log.u_bbr.delRate = rack->r_ctl.crte->rate;
		log.u_bbr.cur_del_rate = lentime;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
			       len, &log, false, NULL, __func__, __LINE__,tv);
	}
#endif
	return ((uint32_t)lentime);
}
18331
18332 static int
rack_fast_rsm_output(struct tcpcb * tp,struct tcp_rack * rack,struct rack_sendmap * rsm,uint64_t ts_val,uint32_t cts,uint32_t ms_cts,struct timeval * tv,int len,uint8_t doing_tlp)18333 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
18334 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
18335 {
18336 /*
18337 * Enter the fast retransmit path. We are given that a sched_pin is
18338 * in place (if accounting is compliled in) and the cycle count taken
18339 * at the entry is in the ts_val. The concept her is that the rsm
18340 * now holds the mbuf offsets and such so we can directly transmit
18341 * without a lot of overhead, the len field is already set for
18342 * us to prohibit us from sending too much (usually its 1MSS).
18343 */
18344 struct ip *ip = NULL;
18345 struct udphdr *udp = NULL;
18346 struct tcphdr *th = NULL;
18347 struct mbuf *m = NULL;
18348 struct inpcb *inp;
18349 uint8_t *cpto;
18350 struct tcp_log_buffer *lgb;
18351 #ifdef TCP_ACCOUNTING
18352 uint64_t crtsc;
18353 int cnt_thru = 1;
18354 #endif
18355 struct tcpopt to;
18356 u_char opt[TCP_MAXOLEN];
18357 uint32_t hdrlen, optlen;
18358 int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0;
18359 uint16_t flags;
18360 uint32_t if_hw_tsomaxsegcount = 0, startseq;
18361 uint32_t if_hw_tsomaxsegsize;
18362 int32_t ip_sendflag = IP_NO_SND_TAG_RL;
18363
18364 #ifdef INET6
18365 struct ip6_hdr *ip6 = NULL;
18366
18367 if (rack->r_is_v6) {
18368 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18369 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18370 } else
18371 #endif /* INET6 */
18372 {
18373 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18374 hdrlen = sizeof(struct tcpiphdr);
18375 }
18376 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18377 goto failed;
18378 }
18379 if (doing_tlp) {
18380 /* Its a TLP add the flag, it may already be there but be sure */
18381 rsm->r_flags |= RACK_TLP;
18382 } else {
18383 /* If it was a TLP it is not not on this retransmit */
18384 rsm->r_flags &= ~RACK_TLP;
18385 }
18386 startseq = rsm->r_start;
18387 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18388 inp = rack->rc_inp;
18389 to.to_flags = 0;
18390 flags = tcp_outflags[tp->t_state];
18391 if (flags & (TH_SYN|TH_RST)) {
18392 goto failed;
18393 }
18394 if (rsm->r_flags & RACK_HAS_FIN) {
18395 /* We can't send a FIN here */
18396 goto failed;
18397 }
18398 if (flags & TH_FIN) {
18399 /* We never send a FIN */
18400 flags &= ~TH_FIN;
18401 }
18402 if (tp->t_flags & TF_RCVD_TSTMP) {
18403 to.to_tsval = ms_cts + tp->ts_offset;
18404 to.to_tsecr = tp->ts_recent;
18405 to.to_flags = TOF_TS;
18406 }
18407 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18408 /* TCP-MD5 (RFC2385). */
18409 if (tp->t_flags & TF_SIGNATURE)
18410 to.to_flags |= TOF_SIGNATURE;
18411 #endif
18412 optlen = tcp_addoptions(&to, opt);
18413 hdrlen += optlen;
18414 udp = rack->r_ctl.fsb.udp;
18415 if (udp)
18416 hdrlen += sizeof(struct udphdr);
18417 if (rack->r_ctl.rc_pace_max_segs)
18418 max_val = rack->r_ctl.rc_pace_max_segs;
18419 else if (rack->rc_user_set_max_segs)
18420 max_val = rack->rc_user_set_max_segs * segsiz;
18421 else
18422 max_val = len;
18423 if ((tp->t_flags & TF_TSO) &&
18424 V_tcp_do_tso &&
18425 (len > segsiz) &&
18426 (tp->t_port == 0))
18427 tso = 1;
18428 #ifdef INET6
18429 if (MHLEN < hdrlen + max_linkhdr)
18430 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18431 else
18432 #endif
18433 m = m_gethdr(M_NOWAIT, MT_DATA);
18434 if (m == NULL)
18435 goto failed;
18436 m->m_data += max_linkhdr;
18437 m->m_len = hdrlen;
18438 th = rack->r_ctl.fsb.th;
18439 /* Establish the len to send */
18440 if (len > max_val)
18441 len = max_val;
18442 if ((tso) && (len + optlen > segsiz)) {
18443 uint32_t if_hw_tsomax;
18444 int32_t max_len;
18445
18446 /* extract TSO information */
18447 if_hw_tsomax = tp->t_tsomax;
18448 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18449 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18450 /*
18451 * Check if we should limit by maximum payload
18452 * length:
18453 */
18454 if (if_hw_tsomax != 0) {
18455 /* compute maximum TSO length */
18456 max_len = (if_hw_tsomax - hdrlen -
18457 max_linkhdr);
18458 if (max_len <= 0) {
18459 goto failed;
18460 } else if (len > max_len) {
18461 len = max_len;
18462 }
18463 }
18464 if (len <= segsiz) {
18465 /*
18466 * In case there are too many small fragments don't
18467 * use TSO:
18468 */
18469 tso = 0;
18470 }
18471 } else {
18472 tso = 0;
18473 }
18474 if ((tso == 0) && (len > segsiz))
18475 len = segsiz;
18476 (void)tcp_get_usecs(tv);
18477 if ((len == 0) ||
18478 (len <= MHLEN - hdrlen - max_linkhdr)) {
18479 goto failed;
18480 }
18481 th->th_seq = htonl(rsm->r_start);
18482 th->th_ack = htonl(tp->rcv_nxt);
18483 /*
18484 * The PUSH bit should only be applied
18485 * if the full retransmission is made. If
18486 * we are sending less than this is the
18487 * left hand edge and should not have
18488 * the PUSH bit.
18489 */
18490 if ((rsm->r_flags & RACK_HAD_PUSH) &&
18491 (len == (rsm->r_end - rsm->r_start)))
18492 flags |= TH_PUSH;
18493 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
18494 if (th->th_win == 0) {
18495 tp->t_sndzerowin++;
18496 tp->t_flags |= TF_RXWIN0SENT;
18497 } else
18498 tp->t_flags &= ~TF_RXWIN0SENT;
18499 if (rsm->r_flags & RACK_TLP) {
18500 /*
18501 * TLP should not count in retran count, but
18502 * in its own bin
18503 */
18504 counter_u64_add(rack_tlp_retran, 1);
18505 counter_u64_add(rack_tlp_retran_bytes, len);
18506 } else {
18507 tp->t_sndrexmitpack++;
18508 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18509 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18510 }
18511 #ifdef STATS
18512 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18513 len);
18514 #endif
18515 if (rsm->m == NULL)
18516 goto failed;
18517 if (rsm->m &&
18518 ((rsm->orig_m_len != rsm->m->m_len) ||
18519 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
18520 /* Fix up the orig_m_len and possibly the mbuf offset */
18521 rack_adjust_orig_mlen(rsm);
18522 }
18523 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
18524 if (len <= segsiz) {
18525 /*
18526 * Must have ran out of mbufs for the copy
18527 * shorten it to no longer need tso. Lets
18528 * not put on sendalot since we are low on
18529 * mbufs.
18530 */
18531 tso = 0;
18532 }
18533 if ((m->m_next == NULL) || (len <= 0)){
18534 goto failed;
18535 }
18536 if (udp) {
18537 if (rack->r_is_v6)
18538 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18539 else
18540 ulen = hdrlen + len - sizeof(struct ip);
18541 udp->uh_ulen = htons(ulen);
18542 }
18543 m->m_pkthdr.rcvif = (struct ifnet *)0;
18544 if (TCPS_HAVERCVDSYN(tp->t_state) &&
18545 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
18546 int ect = tcp_ecn_output_established(tp, &flags, len, true);
18547 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18548 (tp->t_flags2 & TF2_ECN_SND_ECE))
18549 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18550 #ifdef INET6
18551 if (rack->r_is_v6) {
18552 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
18553 ip6->ip6_flow |= htonl(ect << 20);
18554 }
18555 else
18556 #endif
18557 {
18558 ip->ip_tos &= ~IPTOS_ECN_MASK;
18559 ip->ip_tos |= ect;
18560 }
18561 }
18562 if (rack->r_ctl.crte != NULL) {
18563 /* See if we can send via the hw queue */
18564 pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
18565 /* If there is nothing in queue (no pacing time) we can send via the hw queue */
18566 if (pacing_delay == 0)
18567 ip_sendflag = 0;
18568 }
18569 tcp_set_flags(th, flags);
18570 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18571 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18572 if (to.to_flags & TOF_SIGNATURE) {
18573 /*
18574 * Calculate MD5 signature and put it into the place
18575 * determined before.
18576 * NOTE: since TCP options buffer doesn't point into
18577 * mbuf's data, calculate offset and use it.
18578 */
18579 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
18580 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
18581 /*
18582 * Do not send segment if the calculation of MD5
18583 * digest has failed.
18584 */
18585 goto failed;
18586 }
18587 }
18588 #endif
18589 #ifdef INET6
18590 if (rack->r_is_v6) {
18591 if (tp->t_port) {
18592 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18593 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18594 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18595 th->th_sum = htons(0);
18596 UDPSTAT_INC(udps_opackets);
18597 } else {
18598 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18599 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18600 th->th_sum = in6_cksum_pseudo(ip6,
18601 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18602 0);
18603 }
18604 }
18605 #endif
18606 #if defined(INET6) && defined(INET)
18607 else
18608 #endif
18609 #ifdef INET
18610 {
18611 if (tp->t_port) {
18612 m->m_pkthdr.csum_flags = CSUM_UDP;
18613 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18614 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18615 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18616 th->th_sum = htons(0);
18617 UDPSTAT_INC(udps_opackets);
18618 } else {
18619 m->m_pkthdr.csum_flags = CSUM_TCP;
18620 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18621 th->th_sum = in_pseudo(ip->ip_src.s_addr,
18622 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18623 IPPROTO_TCP + len + optlen));
18624 }
18625 /* IP version must be set here for ipv4/ipv6 checking later */
18626 KASSERT(ip->ip_v == IPVERSION,
18627 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18628 }
18629 #endif
18630 if (tso) {
18631 /*
18632 * Here we use segsiz since we have no added options besides
18633 * any standard timestamp options (no DSACKs or SACKS are sent
18634 * via either fast-path).
18635 */
18636 KASSERT(len > segsiz,
18637 ("%s: len <= tso_segsz tp:%p", __func__, tp));
18638 m->m_pkthdr.csum_flags |= CSUM_TSO;
18639 m->m_pkthdr.tso_segsz = segsiz;
18640 }
18641 #ifdef INET6
18642 if (rack->r_is_v6) {
18643 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
18644 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18645 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18646 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18647 else
18648 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18649 }
18650 #endif
18651 #if defined(INET) && defined(INET6)
18652 else
18653 #endif
18654 #ifdef INET
18655 {
18656 ip->ip_len = htons(m->m_pkthdr.len);
18657 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
18658 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18659 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18660 if (tp->t_port == 0 || len < V_tcp_minmss) {
18661 ip->ip_off |= htons(IP_DF);
18662 }
18663 } else {
18664 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18665 }
18666 }
18667 #endif
18668 if (doing_tlp == 0) {
18669 /* Set we retransmitted */
18670 rack->rc_gp_saw_rec = 1;
18671 } else {
18672 /* Its a TLP set ca or ss */
18673 if (tp->snd_cwnd > tp->snd_ssthresh) {
18674 /* Set we sent in CA */
18675 rack->rc_gp_saw_ca = 1;
18676 } else {
18677 /* Set we sent in SS */
18678 rack->rc_gp_saw_ss = 1;
18679 }
18680 }
18681 /* Time to copy in our header */
18682 cpto = mtod(m, uint8_t *);
18683 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18684 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18685 if (optlen) {
18686 bcopy(opt, th + 1, optlen);
18687 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18688 } else {
18689 th->th_off = sizeof(struct tcphdr) >> 2;
18690 }
18691 if (tcp_bblogging_on(rack->rc_tp)) {
18692 union tcp_log_stackspecific log;
18693
18694 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
18695 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
18696 counter_u64_add(rack_collapsed_win_rxt, 1);
18697 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
18698 }
18699 memset(&log, 0, sizeof(log));
18700 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18701 if (rack->rack_no_prr)
18702 log.u_bbr.flex1 = 0;
18703 else
18704 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18705 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18706 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18707 log.u_bbr.flex4 = max_val;
18708 /* Save off the early/late values */
18709 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18710 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18711 log.u_bbr.bw_inuse = rack_get_bw(rack);
18712 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
18713 if (doing_tlp == 0)
18714 log.u_bbr.flex8 = 1;
18715 else
18716 log.u_bbr.flex8 = 2;
18717 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
18718 log.u_bbr.flex7 = 55;
18719 log.u_bbr.pkts_out = tp->t_maxseg;
18720 log.u_bbr.timeStamp = cts;
18721 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18722 if (rsm->r_rtr_cnt > 0) {
18723 /*
18724 * When we have a retransmit we want to log the
18725 * burst at send and flight at send from before.
18726 */
18727 log.u_bbr.flex5 = rsm->r_fas;
18728 log.u_bbr.bbr_substate = rsm->r_bas;
18729 } else {
18730 /*
18731 * This is currently unlikely until we do the
18732 * packet pair probes but I will add it for completeness.
18733 */
18734 log.u_bbr.flex5 = log.u_bbr.inflight;
18735 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
18736 }
18737 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
18738 log.u_bbr.delivered = 0;
18739 log.u_bbr.rttProp = (uintptr_t)rsm;
18740 log.u_bbr.delRate = rsm->r_flags;
18741 log.u_bbr.delRate <<= 31;
18742 log.u_bbr.delRate |= rack->r_must_retran;
18743 log.u_bbr.delRate <<= 1;
18744 log.u_bbr.delRate |= 1;
18745 log.u_bbr.pkt_epoch = __LINE__;
18746 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
18747 len, &log, false, NULL, __func__, __LINE__, tv);
18748 } else
18749 lgb = NULL;
18750 if ((rack->r_ctl.crte != NULL) &&
18751 tcp_bblogging_on(tp)) {
18752 rack_log_queue_level(tp, rack, len, tv, cts);
18753 }
18754 #ifdef INET6
18755 if (rack->r_is_v6) {
18756 error = ip6_output(m, inp->in6p_outputopts,
18757 &inp->inp_route6,
18758 ip_sendflag, NULL, NULL, inp);
18759 }
18760 else
18761 #endif
18762 #ifdef INET
18763 {
18764 error = ip_output(m, NULL,
18765 &inp->inp_route,
18766 ip_sendflag, 0, inp);
18767 }
18768 #endif
18769 m = NULL;
18770 if (lgb) {
18771 lgb->tlb_errno = error;
18772 lgb = NULL;
18773 }
18774 /* Move snd_nxt to snd_max so we don't have false retransmissions */
18775 tp->snd_nxt = tp->snd_max;
18776 if (error) {
18777 goto failed;
18778 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) {
18779 rack->rc_hw_nobuf = 0;
18780 rack->r_ctl.rc_agg_delayed = 0;
18781 rack->r_early = 0;
18782 rack->r_late = 0;
18783 rack->r_ctl.rc_agg_early = 0;
18784 }
18785 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
18786 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
18787 if (doing_tlp) {
18788 rack->rc_tlp_in_progress = 1;
18789 rack->r_ctl.rc_tlp_cnt_out++;
18790 }
18791 if (error == 0) {
18792 counter_u64_add(rack_total_bytes, len);
18793 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
18794 if (doing_tlp) {
18795 rack->rc_last_sent_tlp_past_cumack = 0;
18796 rack->rc_last_sent_tlp_seq_valid = 1;
18797 rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18798 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18799 }
18800 if (rack->r_ctl.rc_prr_sndcnt >= len)
18801 rack->r_ctl.rc_prr_sndcnt -= len;
18802 else
18803 rack->r_ctl.rc_prr_sndcnt = 0;
18804 }
18805 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
18806 rack->forced_ack = 0; /* If we send something zap the FA flag */
18807 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
18808 rack->r_ctl.retran_during_recovery += len;
18809 {
18810 int idx;
18811
18812 idx = (len / segsiz) + 3;
18813 if (idx >= TCP_MSS_ACCT_ATIMER)
18814 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18815 else
18816 counter_u64_add(rack_out_size[idx], 1);
18817 }
18818 if (tp->t_rtttime == 0) {
18819 tp->t_rtttime = ticks;
18820 tp->t_rtseq = startseq;
18821 KMOD_TCPSTAT_INC(tcps_segstimed);
18822 }
18823 counter_u64_add(rack_fto_rsm_send, 1);
18824 if (error && (error == ENOBUFS)) {
18825 if (rack->r_ctl.crte != NULL) {
18826 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
18827 if (tcp_bblogging_on(rack->rc_tp))
18828 rack_log_queue_level(tp, rack, len, tv, cts);
18829 } else
18830 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
18831 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
18832 if (rack->rc_enobuf < 0x7f)
18833 rack->rc_enobuf++;
18834 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
18835 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
18836 if (rack->r_ctl.crte != NULL) {
18837 counter_u64_add(rack_saw_enobuf_hw, 1);
18838 tcp_rl_log_enobuf(rack->r_ctl.crte);
18839 }
18840 counter_u64_add(rack_saw_enobuf, 1);
18841 } else {
18842 pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
18843 }
18844 rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0);
18845 #ifdef TCP_ACCOUNTING
18846 crtsc = get_cyclecount();
18847 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18848 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
18849 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
18850 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
18851 }
18852 sched_unpin();
18853 #endif
18854 return (0);
18855 failed:
18856 if (m)
18857 m_free(m);
18858 return (-1);
18859 }
18860
18861 static void
rack_sndbuf_autoscale(struct tcp_rack * rack)18862 rack_sndbuf_autoscale(struct tcp_rack *rack)
18863 {
18864 /*
18865 * Automatic sizing of send socket buffer. Often the send buffer
18866 * size is not optimally adjusted to the actual network conditions
18867 * at hand (delay bandwidth product). Setting the buffer size too
18868 * small limits throughput on links with high bandwidth and high
18869 * delay (eg. trans-continental/oceanic links). Setting the
18870 * buffer size too big consumes too much real kernel memory,
18871 * especially with many connections on busy servers.
18872 *
18873 * The criteria to step up the send buffer one notch are:
18874 * 1. receive window of remote host is larger than send buffer
18875 * (with a fudge factor of 5/4th);
18876 * 2. send buffer is filled to 7/8th with data (so we actually
18877 * have data to make use of it);
18878 * 3. send buffer fill has not hit maximal automatic size;
18879 * 4. our send window (slow start and cogestion controlled) is
18880 * larger than sent but unacknowledged data in send buffer.
18881 *
18882 * Note that the rack version moves things much faster since
18883 * we want to avoid hitting cache lines in the rack_fast_output()
18884 * path so this is called much less often and thus moves
18885 * the SB forward by a percentage.
18886 */
18887 struct socket *so;
18888 struct tcpcb *tp;
18889 uint32_t sendwin, scaleup;
18890
18891 tp = rack->rc_tp;
18892 so = rack->rc_inp->inp_socket;
18893 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
18894 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
18895 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
18896 sbused(&so->so_snd) >=
18897 (so->so_snd.sb_hiwat / 8 * 7) &&
18898 sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
18899 sendwin >= (sbused(&so->so_snd) -
18900 (tp->snd_max - tp->snd_una))) {
18901 if (rack_autosndbuf_inc)
18902 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
18903 else
18904 scaleup = V_tcp_autosndbuf_inc;
18905 if (scaleup < V_tcp_autosndbuf_inc)
18906 scaleup = V_tcp_autosndbuf_inc;
18907 scaleup += so->so_snd.sb_hiwat;
18908 if (scaleup > V_tcp_autosndbuf_max)
18909 scaleup = V_tcp_autosndbuf_max;
18910 if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
18911 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
18912 }
18913 }
18914 }
18915
/*
 * rack_fast_output() - send new data using the prebuilt fast-send block
 * (rack->r_ctl.fsb) instead of walking the full rack_output() path.
 *
 * Returns 0 on success, with *tot_len increased by the bytes sent and a
 * pacing timer started.  Returns -1 on failure; in that case the fast
 * output path is disabled (r_fast_output cleared) and, when the failure
 * came from ip_output()/ip6_output(), *send_err carries the error.
 */
static int
rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
    uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line)
{
	/*
	 * Enter to do fast output. We are given that the sched_pin is
	 * in place (if accounting is compiled in) and the cycle count taken
	 * at entry is in place in ts_val. The idea here is that
	 * we know how many more bytes needs to be sent (presumably either
	 * during pacing or to fill the cwnd and that was greater than
	 * the max-burst). We have how much to send and all the info we
	 * need to just send.
	 */
#ifdef INET
	struct ip *ip = NULL;
#endif
	struct udphdr *udp = NULL;
	struct tcphdr *th = NULL;
	struct mbuf *m, *s_mb;
	struct inpcb *inp;
	uint8_t *cpto;
	struct tcp_log_buffer *lgb;
#ifdef TCP_ACCOUNTING
	uint64_t crtsc;
#endif
	struct tcpopt to;
	u_char opt[TCP_MAXOLEN];
	uint32_t hdrlen, optlen;
#ifdef TCP_ACCOUNTING
	int cnt_thru = 1;
#endif
	int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
	uint16_t flags;
	uint32_t s_soff;
	uint32_t if_hw_tsomaxsegcount = 0, startseq;
	uint32_t if_hw_tsomaxsegsize;
	uint32_t add_flag = RACK_SENT_FP;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;

	/* Point at the precomputed IP/TCP header template in the fsb. */
	if (rack->r_is_v6) {
		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	} else
#endif				/* INET6 */
	{
#ifdef INET
		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
		hdrlen = sizeof(struct tcpiphdr);
#endif
	}
	/* A UDP tunnel port with tunneling disabled is inconsistent; fail. */
	if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
		m = NULL;
		goto failed;
	}
	rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
	startseq = tp->snd_max;
	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	inp = rack->rc_inp;
	len = rack->r_ctl.fsb.left_to_send;
	to.to_flags = 0;
	flags = rack->r_ctl.fsb.tcp_flags;
	/* Only timestamps (and possibly MD5) are sent via this path. */
	if (tp->t_flags & TF_RCVD_TSTMP) {
		to.to_tsval = ms_cts + tp->ts_offset;
		to.to_tsecr = tp->ts_recent;
		to.to_flags = TOF_TS;
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	/* TCP-MD5 (RFC2385). */
	if (tp->t_flags & TF_SIGNATURE)
		to.to_flags |= TOF_SIGNATURE;
#endif
	optlen = tcp_addoptions(&to, opt);
	hdrlen += optlen;
	udp = rack->r_ctl.fsb.udp;
	if (udp)
		hdrlen += sizeof(struct udphdr);
	if (rack->r_ctl.rc_pace_max_segs)
		max_val = rack->r_ctl.rc_pace_max_segs;
	else if (rack->rc_user_set_max_segs)
		max_val = rack->rc_user_set_max_segs * segsiz;
	else
		max_val = len;
	if ((tp->t_flags & TF_TSO) &&
	    V_tcp_do_tso &&
	    (len > segsiz) &&
	    (tp->t_port == 0))
		tso = 1;
	/*
	 * Send loop: when TSO is off and more than a segment remains,
	 * we jump back here (with len reset to segsiz) to emit another
	 * segment - see the goto again near the bottom.
	 */
again:
#ifdef INET6
	if (MHLEN < hdrlen + max_linkhdr)
		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
	else
#endif
		m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		goto failed;
	m->m_data += max_linkhdr;
	m->m_len = hdrlen;
	th = rack->r_ctl.fsb.th;
	/* Establish the len to send */
	if (len > max_val)
		len = max_val;
	if ((tso) && (len + optlen > segsiz)) {
		uint32_t if_hw_tsomax;
		int32_t max_len;

		/* extract TSO information */
		if_hw_tsomax = tp->t_tsomax;
		if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
		if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
		/*
		 * Check if we should limit by maximum payload
		 * length:
		 */
		if (if_hw_tsomax != 0) {
			/* compute maximum TSO length */
			max_len = (if_hw_tsomax - hdrlen -
			    max_linkhdr);
			if (max_len <= 0) {
				goto failed;
			} else if (len > max_len) {
				len = max_len;
			}
		}
		if (len <= segsiz) {
			/*
			 * In case there are too many small fragments don't
			 * use TSO:
			 */
			tso = 0;
		}
	} else {
		tso = 0;
	}
	if ((tso == 0) && (len > segsiz))
		len = segsiz;
	/* Refresh *tv with the current time for logging/rate sampling. */
	(void)tcp_get_usecs(tv);
	/*
	 * NOTE(review): sends small enough to fit in the header mbuf are
	 * rejected here and fall back to the slow path - confirm intent.
	 */
	if ((len == 0) ||
	    (len <= MHLEN - hdrlen - max_linkhdr)) {
		goto failed;
	}
	sb_offset = tp->snd_max - tp->snd_una;
	th->th_seq = htonl(tp->snd_max);
	th->th_ack = htonl(tp->rcv_nxt);
	th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
	if (th->th_win == 0) {
		tp->t_sndzerowin++;
		tp->t_flags |= TF_RXWIN0SENT;
	} else
		tp->t_flags &= ~TF_RXWIN0SENT;
	tp->snd_up = tp->snd_una;	/* drag it along, its deprecated */
	KMOD_TCPSTAT_INC(tcps_sndpack);
	KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
#ifdef STATS
	stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
	    len);
#endif
	if (rack->r_ctl.fsb.m == NULL)
		goto failed;

	/* s_mb and s_soff are saved for rack_log_output */
	m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
	    &s_mb, &s_soff);
	if (len <= segsiz) {
		/*
		 * Must have ran out of mbufs for the copy
		 * shorten it to no longer need tso. Lets
		 * not put on sendalot since we are low on
		 * mbufs.
		 */
		tso = 0;
	}
	/* Set PUSH on the send that empties what we planned to send. */
	if (rack->r_ctl.fsb.rfo_apply_push &&
	    (len == rack->r_ctl.fsb.left_to_send)) {
		flags |= TH_PUSH;
		add_flag |= RACK_HAD_PUSH;
	}
	if ((m->m_next == NULL) || (len <= 0)){
		goto failed;
	}
	if (udp) {
		if (rack->r_is_v6)
			ulen = hdrlen + len - sizeof(struct ip6_hdr);
		else
			ulen = hdrlen + len - sizeof(struct ip);
		udp->uh_ulen = htons(ulen);
	}
	m->m_pkthdr.rcvif = (struct ifnet *)0;
	/* Apply ECN marking to the IP header if ECN is negotiated. */
	if (TCPS_HAVERCVDSYN(tp->t_state) &&
	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
		int ect = tcp_ecn_output_established(tp, &flags, len, false);
		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
		    (tp->t_flags2 & TF2_ECN_SND_ECE))
			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
#ifdef INET6
		if (rack->r_is_v6) {
			ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
			ip6->ip6_flow |= htonl(ect << 20);
		}
		else
#endif
		{
#ifdef INET
			ip->ip_tos &= ~IPTOS_ECN_MASK;
			ip->ip_tos |= ect;
#endif
		}
	}
	tcp_set_flags(th, flags);
	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if (to.to_flags & TOF_SIGNATURE) {
		/*
		 * Calculate MD5 signature and put it into the place
		 * determined before.
		 * NOTE: since TCP options buffer doesn't point into
		 * mbuf's data, calculate offset and use it.
		 */
		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
			/*
			 * Do not send segment if the calculation of MD5
			 * digest has failed.
			 */
			goto failed;
		}
	}
#endif
	/* Set up checksum offload fields: TCP or UDP tunnel, v4 or v6. */
#ifdef INET6
	if (rack->r_is_v6) {
		if (tp->t_port) {
			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
			th->th_sum = htons(0);
			UDPSTAT_INC(udps_opackets);
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			th->th_sum = in6_cksum_pseudo(ip6,
			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
			    0);
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		if (tp->t_port) {
			m->m_pkthdr.csum_flags = CSUM_UDP;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
			th->th_sum = htons(0);
			UDPSTAT_INC(udps_opackets);
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			th->th_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
			    IPPROTO_TCP + len + optlen));
		}
		/* IP version must be set here for ipv4/ipv6 checking later */
		KASSERT(ip->ip_v == IPVERSION,
		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
	}
#endif
	if (tso) {
		/*
		 * Here we use segsiz since we have no added options besides
		 * any standard timestamp options (no DSACKs or SACKS are sent
		 * via either fast-path).
		 */
		KASSERT(len > segsiz,
		    ("%s: len <= tso_segsz tp:%p", __func__, tp));
		m->m_pkthdr.csum_flags |= CSUM_TSO;
		m->m_pkthdr.tso_segsz = segsiz;
	}
	/* Finish the IP header (length, TTL/hoplimit, PMTU discovery). */
#ifdef INET6
	if (rack->r_is_v6) {
		ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
		else
			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		ip->ip_len = htons(m->m_pkthdr.len);
		ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
			if (tp->t_port == 0 || len < V_tcp_minmss) {
				ip->ip_off |= htons(IP_DF);
			}
		} else {
			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
		}
	}
#endif
	if (tp->snd_cwnd > tp->snd_ssthresh) {
		/* Set we sent in CA */
		rack->rc_gp_saw_ca = 1;
	} else {
		/* Set we sent in SS */
		rack->rc_gp_saw_ss = 1;
	}
	/* Time to copy in our header */
	cpto = mtod(m, uint8_t *);
	memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
	/* th now points into the mbuf copy, not the fsb template. */
	th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
	if (optlen) {
		bcopy(opt, th + 1, optlen);
		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	} else {
		th->th_off = sizeof(struct tcphdr) >> 2;
	}
	if ((rack->r_ctl.crte != NULL) &&
	    tcp_bblogging_on(tp)) {
		rack_log_queue_level(tp, rack, len, tv, cts);
	}
	/* Black-box logging of this transmit, if enabled. */
	if (tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;

		memset(&log, 0, sizeof(log));
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
		if (rack->rack_no_prr)
			log.u_bbr.flex1 = 0;
		else
			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
		log.u_bbr.flex4 = max_val;
		/* Save off the early/late values */
		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
		log.u_bbr.bw_inuse = rack_get_bw(rack);
		log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
		log.u_bbr.flex8 = 0;
		log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
		log.u_bbr.flex7 = 44;
		log.u_bbr.pkts_out = tp->t_maxseg;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.flex5 = log.u_bbr.inflight;
		log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
		log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send;
		log.u_bbr.rttProp = 0;
		log.u_bbr.delRate = rack->r_must_retran;
		log.u_bbr.delRate <<= 1;
		log.u_bbr.pkt_epoch = line;
		/* For fast output no retrans so just inflight and how many mss we send */
		log.u_bbr.flex5 = log.u_bbr.inflight;
		log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
		lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
		    len, &log, false, NULL, __func__, __LINE__, tv);
	} else
		lgb = NULL;
	/* Hand the packet to the IP layer. */
#ifdef INET6
	if (rack->r_is_v6) {
		error = ip6_output(m, inp->in6p_outputopts,
		    &inp->inp_route6,
		    0, NULL, NULL, inp);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		error = ip_output(m, NULL,
		    &inp->inp_route,
		    0, 0, inp);
	}
#endif
	if (lgb) {
		lgb->tlb_errno = error;
		lgb = NULL;
	}
	if (error) {
		*send_err = error;
		m = NULL;
		goto failed;
	} else if (rack->rc_hw_nobuf) {
		/* A clean send clears the hardware no-buffer backoff. */
		rack->rc_hw_nobuf = 0;
		rack->r_ctl.rc_agg_delayed = 0;
		rack->r_early = 0;
		rack->r_late = 0;
		rack->r_ctl.rc_agg_early = 0;
	}
	/* Long-term bandwidth sample bookkeeping. */
	if ((error == 0) && (rack->lt_bw_up == 0)) {
		/* Unlikely */
		rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv);
		rack->r_ctl.lt_seq = tp->snd_una;
		rack->lt_bw_up = 1;
	} else if ((error == 0) &&
		   (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) {
		/*
		 * Need to record what we have since we are
		 * approaching seq wrap.
		 */
		struct timeval tv;
		uint64_t tmark;

		rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
		rack->r_ctl.lt_seq = tp->snd_una;
		tmark = tcp_get_u64_usecs(&tv);
		if (tmark > rack->r_ctl.lt_timemark) {
			rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
			rack->r_ctl.lt_timemark = tmark;
		}
	}
	rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
	    NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
	if (tp->snd_una == tp->snd_max) {
		rack->r_ctl.rc_tlp_rxt_last_time = cts;
		rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
		tp->t_acktime = ticks;
	}
	counter_u64_add(rack_total_bytes, len);
	tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);

	rack->forced_ack = 0;	/* If we send something zap the FA flag */
	*tot_len += len;
	if ((tp->t_flags & TF_GPUTINPROG) == 0)
		rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
	tp->snd_max += len;
	tp->snd_nxt = tp->snd_max;
	if (rack->rc_new_rnd_needed) {
		rack_new_round_starts(tp, rack, tp->snd_max);
	}
	/* Bucket this send's size (in segments) for statistics. */
	{
		int idx;

		idx = (len / segsiz) + 3;
		if (idx >= TCP_MSS_ACCT_ATIMER)
			counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
		else
			counter_u64_add(rack_out_size[idx], 1);
	}
	if (len <= rack->r_ctl.fsb.left_to_send)
		rack->r_ctl.fsb.left_to_send -= len;
	else
		rack->r_ctl.fsb.left_to_send = 0;
	if (rack->r_ctl.fsb.left_to_send < segsiz) {
		rack->r_fast_output = 0;
		rack->r_ctl.fsb.left_to_send = 0;
		/* At the end of fast_output scale up the sb */
		SOCK_SENDBUF_LOCK(rack->rc_inp->inp_socket);
		rack_sndbuf_autoscale(rack);
		SOCK_SENDBUF_UNLOCK(rack->rc_inp->inp_socket);
	}
	if (tp->t_rtttime == 0) {
		tp->t_rtttime = ticks;
		tp->t_rtseq = startseq;
		KMOD_TCPSTAT_INC(tcps_segstimed);
	}
	/* More to send within this pace burst and no TSO? Loop again. */
	if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
	    (max_val > len) &&
	    (*tot_len < rack->r_ctl.rc_pace_max_segs) &&
	    (tso == 0)) {
		max_val -= len;
		len = segsiz;
		th = rack->r_ctl.fsb.th;
#ifdef TCP_ACCOUNTING
		cnt_thru++;
#endif
		goto again;
	}
	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
	counter_u64_add(rack_fto_send, 1);
	pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
	rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0);
#ifdef TCP_ACCOUNTING
	crtsc = get_cyclecount();
	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
		tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
		tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz);
	}
	sched_unpin();
#endif
	return (0);
failed:
	if (m)
		m_free(m);
	rack->r_fast_output = 0;
	return (-1);
}
19413
19414 static inline void
rack_setup_fast_output(struct tcpcb * tp,struct tcp_rack * rack,struct sockbuf * sb,int len,int orig_len,int segsiz,uint32_t pace_max_seg,bool hw_tls,uint16_t flags)19415 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack,
19416 struct sockbuf *sb,
19417 int len, int orig_len, int segsiz, uint32_t pace_max_seg,
19418 bool hw_tls,
19419 uint16_t flags)
19420 {
19421 rack->r_fast_output = 1;
19422 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19423 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19424 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
19425 rack->r_ctl.fsb.tcp_flags = flags;
19426 rack->r_ctl.fsb.left_to_send = orig_len - len;
19427 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) {
19428 /* Less than a full sized pace, lets not */
19429 rack->r_fast_output = 0;
19430 return;
19431 } else {
19432 /* Round down to the nearest pace_max_seg */
19433 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg);
19434 }
19435 if (hw_tls)
19436 rack->r_ctl.fsb.hw_tls = 1;
19437 else
19438 rack->r_ctl.fsb.hw_tls = 0;
19439 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19440 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19441 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19442 (tp->snd_max - tp->snd_una)));
19443 if (rack->r_ctl.fsb.left_to_send < segsiz)
19444 rack->r_fast_output = 0;
19445 else {
19446 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19447 rack->r_ctl.fsb.rfo_apply_push = 1;
19448 else
19449 rack->r_ctl.fsb.rfo_apply_push = 0;
19450 }
19451 }
19452
19453 static uint32_t
rack_get_hpts_pacing_min_for_bw(struct tcp_rack * rack,int32_t segsiz)19454 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz)
19455 {
19456 uint64_t min_time;
19457 uint32_t maxlen;
19458
19459 min_time = (uint64_t)get_hpts_min_sleep_time();
19460 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC);
19461 maxlen = roundup(maxlen, segsiz);
19462 return (maxlen);
19463 }
19464
19465 static struct rack_sendmap *
rack_check_collapsed(struct tcp_rack * rack,uint32_t cts)19466 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
19467 {
19468 struct rack_sendmap *rsm = NULL;
19469 int thresh;
19470
19471 restart:
19472 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
19473 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
19474 /* Nothing, strange turn off validity */
19475 rack->r_collapse_point_valid = 0;
19476 return (NULL);
19477 }
19478 /* Can we send it yet? */
19479 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
19480 /*
19481 * Receiver window has not grown enough for
19482 * the segment to be put on the wire.
19483 */
19484 return (NULL);
19485 }
19486 if (rsm->r_flags & RACK_ACKED) {
19487 /*
19488 * It has been sacked, lets move to the
19489 * next one if possible.
19490 */
19491 rack->r_ctl.last_collapse_point = rsm->r_end;
19492 /* Are we done? */
19493 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19494 rack->r_ctl.high_collapse_point)) {
19495 rack->r_collapse_point_valid = 0;
19496 return (NULL);
19497 }
19498 goto restart;
19499 }
19500 /* Now has it been long enough ? */
19501 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1);
19502 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
19503 rack_log_collapse(rack, rsm->r_start,
19504 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19505 thresh, __LINE__, 6, rsm->r_flags, rsm);
19506 return (rsm);
19507 }
19508 /* Not enough time */
19509 rack_log_collapse(rack, rsm->r_start,
19510 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19511 thresh, __LINE__, 7, rsm->r_flags, rsm);
19512 return (NULL);
19513 }
19514
19515 static inline void
rack_validate_sizes(struct tcp_rack * rack,int32_t * len,int32_t segsiz,uint32_t pace_max_seg)19516 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
19517 {
19518 if ((rack->full_size_rxt == 0) &&
19519 (rack->shape_rxt_to_pacing_min == 0) &&
19520 (*len >= segsiz)) {
19521 *len = segsiz;
19522 } else if (rack->shape_rxt_to_pacing_min &&
19523 rack->gp_ready) {
19524 /* We use pacing min as shaping len req */
19525 uint32_t maxlen;
19526
19527 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
19528 if (*len > maxlen)
19529 *len = maxlen;
19530 } else {
19531 /*
19532 * The else is full_size_rxt is on so send it all
19533 * note we do need to check this for exceeding
19534 * our max segment size due to the fact that
19535 * we do sometimes merge chunks together i.e.
19536 * we cannot just assume that we will never have
19537 * a chunk greater than pace_max_seg
19538 */
19539 if (*len > pace_max_seg)
19540 *len = pace_max_seg;
19541 }
19542 }
19543
19544 static int
rack_output(struct tcpcb * tp)19545 rack_output(struct tcpcb *tp)
19546 {
19547 struct socket *so;
19548 uint32_t recwin;
19549 uint32_t sb_offset, s_moff = 0;
19550 int32_t len, error = 0;
19551 uint16_t flags;
19552 struct mbuf *m, *s_mb = NULL;
19553 struct mbuf *mb;
19554 uint32_t if_hw_tsomaxsegcount = 0;
19555 uint32_t if_hw_tsomaxsegsize;
19556 int32_t segsiz, minseg;
19557 long tot_len_this_send = 0;
19558 #ifdef INET
19559 struct ip *ip = NULL;
19560 #endif
19561 struct udphdr *udp = NULL;
19562 struct tcp_rack *rack;
19563 struct tcphdr *th;
19564 uint8_t pass = 0;
19565 uint8_t mark = 0;
19566 uint8_t check_done = 0;
19567 uint8_t wanted_cookie = 0;
19568 u_char opt[TCP_MAXOLEN];
19569 unsigned ipoptlen, optlen, hdrlen, ulen=0;
19570 uint32_t rack_seq;
19571
19572 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
19573 unsigned ipsec_optlen = 0;
19574
19575 #endif
19576 int32_t idle, sendalot;
19577 uint32_t tot_idle;
19578 int32_t sub_from_prr = 0;
19579 volatile int32_t sack_rxmit;
19580 struct rack_sendmap *rsm = NULL;
19581 int32_t tso, mtu;
19582 struct tcpopt to;
19583 int32_t pacing_delay = 0;
19584 int32_t sup_rack = 0;
19585 uint32_t cts, ms_cts, delayed, early;
19586 uint32_t add_flag = RACK_SENT_SP;
19587 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
19588 uint8_t doing_tlp = 0;
19589 uint32_t cwnd_to_use, pace_max_seg;
19590 int32_t do_a_prefetch = 0;
19591 int32_t prefetch_rsm = 0;
19592 int32_t orig_len = 0;
19593 struct timeval tv;
19594 int32_t prefetch_so_done = 0;
19595 struct tcp_log_buffer *lgb;
19596 struct inpcb *inp = tptoinpcb(tp);
19597 struct sockbuf *sb;
19598 uint64_t ts_val = 0;
19599 #ifdef TCP_ACCOUNTING
19600 uint64_t crtsc;
19601 #endif
19602 #ifdef INET6
19603 struct ip6_hdr *ip6 = NULL;
19604 int32_t isipv6;
19605 #endif
19606 bool hpts_calling, hw_tls = false;
19607
19608 NET_EPOCH_ASSERT();
19609 INP_WLOCK_ASSERT(inp);
19610
19611 /* setup and take the cache hits here */
19612 rack = (struct tcp_rack *)tp->t_fb_ptr;
19613 #ifdef TCP_ACCOUNTING
19614 sched_pin();
19615 ts_val = get_cyclecount();
19616 #endif
19617 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
19618 tp->t_flags2 &= ~TF2_HPTS_CALLS;
19619 #ifdef TCP_OFFLOAD
19620 if (tp->t_flags & TF_TOE) {
19621 #ifdef TCP_ACCOUNTING
19622 sched_unpin();
19623 #endif
19624 return (tcp_offload_output(tp));
19625 }
19626 #endif
19627 if (rack->rack_deferred_inited == 0) {
19628 /*
19629 * If we are the connecting socket we will
19630 * hit rack_init() when no sequence numbers
19631 * are setup. This makes it so we must defer
19632 * some initialization. Call that now.
19633 */
19634 rack_deferred_init(tp, rack);
19635 }
19636 /*
19637 * For TFO connections in SYN_RECEIVED, only allow the initial
19638 * SYN|ACK and those sent by the retransmit timer.
19639 */
19640 if ((tp->t_flags & TF_FASTOPEN) &&
19641 (tp->t_state == TCPS_SYN_RECEIVED) &&
19642 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
19643 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */
19644 #ifdef TCP_ACCOUNTING
19645 sched_unpin();
19646 #endif
19647 return (0);
19648 }
19649 #ifdef INET6
19650 if (rack->r_state) {
19651 /* Use the cache line loaded if possible */
19652 isipv6 = rack->r_is_v6;
19653 } else {
19654 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
19655 }
19656 #endif
19657 early = 0;
19658 cts = tcp_get_usecs(&tv);
19659 ms_cts = tcp_tv_to_msec(&tv);
19660 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
19661 tcp_in_hpts(rack->rc_tp)) {
19662 /*
19663 * We are on the hpts for some timer but not hptsi output.
19664 * Remove from the hpts unconditionally.
19665 */
19666 rack_timer_cancel(tp, rack, cts, __LINE__);
19667 }
19668 /* Are we pacing and late? */
19669 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19670 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
19671 /* We are delayed */
19672 delayed = cts - rack->r_ctl.rc_last_output_to;
19673 } else {
19674 delayed = 0;
19675 }
19676 /* Do the timers, which may override the pacer */
19677 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
19678 int retval;
19679
19680 retval = rack_process_timers(tp, rack, cts, hpts_calling,
19681 &doing_tlp);
19682 if (retval != 0) {
19683 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
19684 #ifdef TCP_ACCOUNTING
19685 sched_unpin();
19686 #endif
19687 /*
19688 * If timers want tcp_drop(), then pass error out,
19689 * otherwise suppress it.
19690 */
19691 return (retval < 0 ? retval : 0);
19692 }
19693 }
19694 if (rack->rc_in_persist) {
19695 if (tcp_in_hpts(rack->rc_tp) == 0) {
19696 /* Timer is not running */
19697 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19698 }
19699 #ifdef TCP_ACCOUNTING
19700 sched_unpin();
19701 #endif
19702 return (0);
19703 }
19704 if ((rack->rc_ack_required == 1) &&
19705 (rack->r_timer_override == 0)){
19706 /* A timeout occurred and no ack has arrived */
19707 if (tcp_in_hpts(rack->rc_tp) == 0) {
19708 /* Timer is not running */
19709 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19710 }
19711 #ifdef TCP_ACCOUNTING
19712 sched_unpin();
19713 #endif
19714 return (0);
19715 }
19716 if ((rack->r_timer_override) ||
19717 (rack->rc_ack_can_sendout_data) ||
19718 (delayed) ||
19719 (tp->t_state < TCPS_ESTABLISHED)) {
19720 rack->rc_ack_can_sendout_data = 0;
19721 if (tcp_in_hpts(rack->rc_tp))
19722 tcp_hpts_remove(rack->rc_tp);
19723 } else if (tcp_in_hpts(rack->rc_tp)) {
19724 /*
19725 * On the hpts you can't pass even if ACKNOW is on, we will
19726 * when the hpts fires.
19727 */
19728 #ifdef TCP_ACCOUNTING
19729 crtsc = get_cyclecount();
19730 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19731 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
19732 tp->tcp_cnt_counters[SND_BLOCKED]++;
19733 }
19734 sched_unpin();
19735 #endif
19736 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
19737 return (0);
19738 }
19739 /* Finish out both pacing early and late accounting */
19740 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19741 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
19742 early = rack->r_ctl.rc_last_output_to - cts;
19743 } else
19744 early = 0;
19745 if (delayed && (rack->rc_always_pace == 1)) {
19746 rack->r_ctl.rc_agg_delayed += delayed;
19747 rack->r_late = 1;
19748 } else if (early && (rack->rc_always_pace == 1)) {
19749 rack->r_ctl.rc_agg_early += early;
19750 rack->r_early = 1;
19751 } else if (rack->rc_always_pace == 0) {
19752 /* Non-paced we are not late */
19753 rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0;
19754 rack->r_early = rack->r_late = 0;
19755 }
19756 /* Now that early/late accounting is done turn off the flag */
19757 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
19758 rack->r_wanted_output = 0;
19759 rack->r_timer_override = 0;
19760 if ((tp->t_state != rack->r_state) &&
19761 TCPS_HAVEESTABLISHED(tp->t_state)) {
19762 rack_set_state(tp, rack);
19763 }
19764 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
19765 minseg = segsiz;
19766 if (rack->r_ctl.rc_pace_max_segs == 0)
19767 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
19768 else
19769 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
19770 if ((rack->r_fast_output) &&
19771 (doing_tlp == 0) &&
19772 (tp->rcv_numsacks == 0)) {
19773 int ret;
19774
19775 error = 0;
19776 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
19777 if (ret > 0)
19778 return(ret);
19779 else if (error) {
19780 inp = rack->rc_inp;
19781 so = inp->inp_socket;
19782 sb = &so->so_snd;
19783 goto nomore;
19784 } else {
19785 /* Return == 0, if there is more we can send tot_len wise fall through and send */
19786 if (tot_len_this_send >= pace_max_seg)
19787 return (ret);
19788 #ifdef TCP_ACCOUNTING
19789 /* We need to re-pin since fast_output un-pined */
19790 sched_pin();
19791 ts_val = get_cyclecount();
19792 #endif
19793 /* Fall back out so we can send any more that may bring us to pace_max_seg */
19794 }
19795 }
19796 inp = rack->rc_inp;
19797 /*
19798 * For TFO connections in SYN_SENT or SYN_RECEIVED,
19799 * only allow the initial SYN or SYN|ACK and those sent
19800 * by the retransmit timer.
19801 */
19802 if ((tp->t_flags & TF_FASTOPEN) &&
19803 ((tp->t_state == TCPS_SYN_RECEIVED) ||
19804 (tp->t_state == TCPS_SYN_SENT)) &&
19805 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
19806 (tp->t_rxtshift == 0)) { /* not a retransmit */
19807 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19808 #ifdef TCP_ACCOUNTING
19809 sched_unpin();
19810 #endif
19811 return (0);
19812 }
19813 /*
19814 * Determine length of data that should be transmitted, and flags
19815 * that will be used. If there is some data or critical controls
19816 * (SYN, RST) to send, then transmit; otherwise, investigate
19817 * further.
19818 */
19819 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
19820 if (tp->t_idle_reduce) {
19821 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
19822 rack_cc_after_idle(rack, tp);
19823 }
19824 tp->t_flags &= ~TF_LASTIDLE;
19825 if (idle) {
19826 if (tp->t_flags & TF_MORETOCOME) {
19827 tp->t_flags |= TF_LASTIDLE;
19828 idle = 0;
19829 }
19830 }
19831 if ((tp->snd_una == tp->snd_max) &&
19832 rack->r_ctl.rc_went_idle_time &&
19833 (cts > rack->r_ctl.rc_went_idle_time)) {
19834 tot_idle = (cts - rack->r_ctl.rc_went_idle_time);
19835 if (tot_idle > rack_min_probertt_hold) {
19836 /* Count as a probe rtt */
19837 if (rack->in_probe_rtt == 0) {
19838 rack->r_ctl.rc_lower_rtt_us_cts = cts;
19839 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
19840 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
19841 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
19842 } else {
19843 rack_exit_probertt(rack, cts);
19844 }
19845 }
19846 } else
19847 tot_idle = 0;
19848 if (rack_use_fsb &&
19849 (rack->r_ctl.fsb.tcp_ip_hdr) &&
19850 (rack->r_fsb_inited == 0) &&
19851 (rack->r_state != TCPS_CLOSED))
19852 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
19853 if (rack->rc_sendvars_notset == 1) {
19854 rack->rc_sendvars_notset = 0;
19855 /*
19856 * Make sure any TCP timers (keep-alive) is not running.
19857 */
19858 tcp_timer_stop(tp);
19859 }
19860 if ((rack->rack_no_prr == 1) &&
19861 (rack->rc_always_pace == 0)) {
19862 /*
19863 * Sanity check before sending, if we have
19864 * no-pacing enabled and prr is turned off that
19865 * is a logistics error. Correct this by turnning
19866 * prr back on. A user *must* set some form of
19867 * pacing in order to turn PRR off. We do this
19868 * in the output path so that we can avoid socket
19869 * option ordering issues that would occur if we
19870 * tried to do it while setting rack_no_prr on.
19871 */
19872 rack->rack_no_prr = 0;
19873 }
19874 if ((rack->pcm_enabled == 1) &&
19875 (rack->pcm_needed == 0) &&
19876 (tot_idle > 0)) {
19877 /*
19878 * We have been idle some micro seconds. We need
19879 * to factor this in to see if a PCM is needed.
19880 */
19881 uint32_t rtts_idle, rnds;
19882
19883 if (tp->t_srtt)
19884 rtts_idle = tot_idle / tp->t_srtt;
19885 else
19886 rtts_idle = 0;
19887 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
19888 rack->r_ctl.pcm_idle_rounds += rtts_idle;
19889 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
19890 rack->pcm_needed = 1;
19891 rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round );
19892 }
19893 }
19894 again:
19895 sendalot = 0;
19896 cts = tcp_get_usecs(&tv);
19897 ms_cts = tcp_tv_to_msec(&tv);
19898 tso = 0;
19899 mtu = 0;
19900 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
19901 (rack->r_ctl.pcm_max_seg == 0)) {
19902 /*
19903 * We set in our first send so we know that the ctf_fixed_maxseg
19904 * has been fully set. If we do it in rack_init() we most likely
19905 * see 512 bytes so we end up at 5120, not desirable.
19906 */
19907 rack->r_ctl.pcm_max_seg = rc_init_window(rack);
19908 if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) {
19909 /*
19910 * Assure our initial PCM probe is at least 10 MSS.
19911 */
19912 rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
19913 }
19914 }
19915 if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
19916 uint32_t rw_avail, cwa;
19917
19918 if (tp->snd_wnd > ctf_outstanding(tp))
19919 rw_avail = tp->snd_wnd - ctf_outstanding(tp);
19920 else
19921 rw_avail = 0;
19922 if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked))
19923 cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
19924 else
19925 cwa = 0;
19926 if ((cwa >= rack->r_ctl.pcm_max_seg) &&
19927 (rw_avail > rack->r_ctl.pcm_max_seg)) {
19928 /* Raise up the max seg for this trip through */
19929 pace_max_seg = rack->r_ctl.pcm_max_seg;
19930 /* Disable any fast output */
19931 rack->r_fast_output = 0;
19932 }
19933 if (rack_verbose_logging) {
19934 rack_log_pcm(rack, 4,
19935 cwa, rack->r_ctl.pcm_max_seg, rw_avail);
19936 }
19937 }
19938 sb_offset = tp->snd_max - tp->snd_una;
19939 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
19940 flags = tcp_outflags[tp->t_state];
19941 while (rack->rc_free_cnt < rack_free_cache) {
19942 rsm = rack_alloc(rack);
19943 if (rsm == NULL) {
19944 if (hpts_calling)
19945 /* Retry in a ms */
19946 pacing_delay = (1 * HPTS_USEC_IN_MSEC);
19947 so = inp->inp_socket;
19948 sb = &so->so_snd;
19949 goto just_return_nolock;
19950 }
19951 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
19952 rack->rc_free_cnt++;
19953 rsm = NULL;
19954 }
19955 sack_rxmit = 0;
19956 len = 0;
19957 rsm = NULL;
19958 if (flags & TH_RST) {
19959 SOCK_SENDBUF_LOCK(inp->inp_socket);
19960 so = inp->inp_socket;
19961 sb = &so->so_snd;
19962 goto send;
19963 }
19964 if (rack->r_ctl.rc_resend) {
19965 /* Retransmit timer */
19966 rsm = rack->r_ctl.rc_resend;
19967 rack->r_ctl.rc_resend = NULL;
19968 len = rsm->r_end - rsm->r_start;
19969 sack_rxmit = 1;
19970 sendalot = 0;
19971 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19972 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19973 __func__, __LINE__,
19974 rsm->r_start, tp->snd_una, tp, rack, rsm));
19975 sb_offset = rsm->r_start - tp->snd_una;
19976 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19977 } else if (rack->r_collapse_point_valid &&
19978 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
19979 /*
19980 * If an RSM is returned then enough time has passed
19981 * for us to retransmit it. Move up the collapse point,
19982 * since this rsm has its chance to retransmit now.
19983 */
19984 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT);
19985 rack->r_ctl.last_collapse_point = rsm->r_end;
19986 /* Are we done? */
19987 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19988 rack->r_ctl.high_collapse_point))
19989 rack->r_collapse_point_valid = 0;
19990 sack_rxmit = 1;
19991 /* We are not doing a TLP */
19992 doing_tlp = 0;
19993 len = rsm->r_end - rsm->r_start;
19994 sb_offset = rsm->r_start - tp->snd_una;
19995 sendalot = 0;
19996 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19997 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
19998 /* We have a retransmit that takes precedence */
19999 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
20000 ((rsm->r_flags & RACK_MUST_RXT) == 0) &&
20001 ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
20002 /* Enter recovery if not induced by a time-out */
20003 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
20004 }
20005 #ifdef INVARIANTS
20006 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
20007 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
20008 tp, rack, rsm, rsm->r_start, tp->snd_una);
20009 }
20010 #endif
20011 len = rsm->r_end - rsm->r_start;
20012 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
20013 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
20014 __func__, __LINE__,
20015 rsm->r_start, tp->snd_una, tp, rack, rsm));
20016 sb_offset = rsm->r_start - tp->snd_una;
20017 sendalot = 0;
20018 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
20019 if (len > 0) {
20020 sack_rxmit = 1;
20021 KMOD_TCPSTAT_INC(tcps_sack_rexmits);
20022 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
20023 min(len, segsiz));
20024 }
20025 } else if (rack->r_ctl.rc_tlpsend) {
20026 /* Tail loss probe */
20027 long cwin;
20028 long tlen;
20029
20030 /*
20031 * Check if we can do a TLP with a RACK'd packet
20032 * this can happen if we are not doing the rack
20033 * cheat and we skipped to a TLP and it
20034 * went off.
20035 */
20036 rsm = rack->r_ctl.rc_tlpsend;
20037 /* We are doing a TLP make sure the flag is preent */
20038 rsm->r_flags |= RACK_TLP;
20039 rack->r_ctl.rc_tlpsend = NULL;
20040 sack_rxmit = 1;
20041 tlen = rsm->r_end - rsm->r_start;
20042 if (tlen > segsiz)
20043 tlen = segsiz;
20044 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
20045 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
20046 __func__, __LINE__,
20047 rsm->r_start, tp->snd_una, tp, rack, rsm));
20048 sb_offset = rsm->r_start - tp->snd_una;
20049 cwin = min(tp->snd_wnd, tlen);
20050 len = cwin;
20051 }
20052 if (rack->r_must_retran &&
20053 (doing_tlp == 0) &&
20054 (SEQ_GT(tp->snd_max, tp->snd_una)) &&
20055 (rsm == NULL)) {
20056 /*
20057 * There are two different ways that we
20058 * can get into this block:
20059 * a) This is a non-sack connection, we had a time-out
20060 * and thus r_must_retran was set and everything
20061 * left outstanding as been marked for retransmit.
20062 * b) The MTU of the path shrank, so that everything
20063 * was marked to be retransmitted with the smaller
20064 * mtu and r_must_retran was set.
20065 *
20066 * This means that we expect the sendmap (outstanding)
20067 * to all be marked must. We can use the tmap to
20068 * look at them.
20069 *
20070 */
20071 int sendwin, flight;
20072
20073 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
20074 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
20075 if (flight >= sendwin) {
20076 /*
20077 * We can't send yet.
20078 */
20079 so = inp->inp_socket;
20080 sb = &so->so_snd;
20081 goto just_return_nolock;
20082 }
20083 /*
20084 * This is the case a/b mentioned above. All
20085 * outstanding/not-acked should be marked.
20086 * We can use the tmap to find them.
20087 */
20088 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
20089 if (rsm == NULL) {
20090 /* TSNH */
20091 rack->r_must_retran = 0;
20092 rack->r_ctl.rc_out_at_rto = 0;
20093 so = inp->inp_socket;
20094 sb = &so->so_snd;
20095 goto just_return_nolock;
20096 }
20097 if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
20098 /*
20099 * The first one does not have the flag, did we collapse
20100 * further up in our list?
20101 */
20102 rack->r_must_retran = 0;
20103 rack->r_ctl.rc_out_at_rto = 0;
20104 rsm = NULL;
20105 sack_rxmit = 0;
20106 } else {
20107 sack_rxmit = 1;
20108 len = rsm->r_end - rsm->r_start;
20109 sb_offset = rsm->r_start - tp->snd_una;
20110 sendalot = 0;
20111 if ((rack->full_size_rxt == 0) &&
20112 (rack->shape_rxt_to_pacing_min == 0) &&
20113 (len >= segsiz))
20114 len = segsiz;
20115 else if (rack->shape_rxt_to_pacing_min &&
20116 rack->gp_ready) {
20117 /* We use pacing min as shaping len req */
20118 uint32_t maxlen;
20119
20120 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20121 if (len > maxlen)
20122 len = maxlen;
20123 }
20124 /*
20125 * Delay removing the flag RACK_MUST_RXT so
20126 * that the fastpath for retransmit will
20127 * work with this rsm.
20128 */
20129 }
20130 }
20131 /*
20132 * Enforce a connection sendmap count limit if set
20133 * as long as we are not retransmiting.
20134 */
20135 if ((rsm == NULL) &&
20136 (V_tcp_map_entries_limit > 0) &&
20137 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
20138 counter_u64_add(rack_to_alloc_limited, 1);
20139 if (!rack->alloc_limit_reported) {
20140 rack->alloc_limit_reported = 1;
20141 counter_u64_add(rack_alloc_limited_conns, 1);
20142 }
20143 so = inp->inp_socket;
20144 sb = &so->so_snd;
20145 goto just_return_nolock;
20146 }
20147 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
20148 /* we are retransmitting the fin */
20149 len--;
20150 if (len) {
20151 /*
20152 * When retransmitting data do *not* include the
20153 * FIN. This could happen from a TLP probe.
20154 */
20155 flags &= ~TH_FIN;
20156 }
20157 }
20158 if (rsm && rack->r_fsb_inited &&
20159 rack_use_rsm_rfo &&
20160 ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
20161 int ret;
20162
20163 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
20164 if (ret == 0)
20165 return (0);
20166 }
20167 so = inp->inp_socket;
20168 sb = &so->so_snd;
20169 if (do_a_prefetch == 0) {
20170 kern_prefetch(sb, &do_a_prefetch);
20171 do_a_prefetch = 1;
20172 }
20173 #ifdef NETFLIX_SHARED_CWND
20174 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
20175 rack->rack_enable_scwnd) {
20176 /* We are doing cwnd sharing */
20177 if (rack->gp_ready &&
20178 (rack->rack_attempted_scwnd == 0) &&
20179 (rack->r_ctl.rc_scw == NULL) &&
20180 tp->t_lib) {
20181 /* The pcbid is in, lets make an attempt */
20182 counter_u64_add(rack_try_scwnd, 1);
20183 rack->rack_attempted_scwnd = 1;
20184 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
20185 &rack->r_ctl.rc_scw_index,
20186 segsiz);
20187 }
20188 if (rack->r_ctl.rc_scw &&
20189 (rack->rack_scwnd_is_idle == 1) &&
20190 sbavail(&so->so_snd)) {
20191 /* we are no longer out of data */
20192 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20193 rack->rack_scwnd_is_idle = 0;
20194 }
20195 if (rack->r_ctl.rc_scw) {
20196 /* First lets update and get the cwnd */
20197 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
20198 rack->r_ctl.rc_scw_index,
20199 tp->snd_cwnd, tp->snd_wnd, segsiz);
20200 }
20201 }
20202 #endif
20203 /*
20204 * Get standard flags, and add SYN or FIN if requested by 'hidden'
20205 * state flags.
20206 */
20207 if (tp->t_flags & TF_NEEDFIN)
20208 flags |= TH_FIN;
20209 if (tp->t_flags & TF_NEEDSYN)
20210 flags |= TH_SYN;
20211 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
20212 void *end_rsm;
20213 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
20214 if (end_rsm)
20215 kern_prefetch(end_rsm, &prefetch_rsm);
20216 prefetch_rsm = 1;
20217 }
20218 SOCK_SENDBUF_LOCK(so);
20219 if ((sack_rxmit == 0) &&
20220 (TCPS_HAVEESTABLISHED(tp->t_state) ||
20221 (tp->t_flags & TF_FASTOPEN))) {
20222 /*
20223 * We are not retransmitting (sack_rxmit is 0) so we
20224 * are sending new data. This is always based on snd_max.
20225 * Now in theory snd_max may be equal to snd_una, if so
20226 * then nothing is outstanding and the offset would be 0.
20227 */
20228 uint32_t avail;
20229
20230 avail = sbavail(sb);
20231 if (SEQ_GT(tp->snd_max, tp->snd_una) && avail)
20232 sb_offset = tp->snd_max - tp->snd_una;
20233 else
20234 sb_offset = 0;
20235 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
20236 if (rack->r_ctl.rc_tlp_new_data) {
20237 /* TLP is forcing out new data */
20238 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
20239 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
20240 }
20241 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
20242 if (tp->snd_wnd > sb_offset)
20243 len = tp->snd_wnd - sb_offset;
20244 else
20245 len = 0;
20246 } else {
20247 len = rack->r_ctl.rc_tlp_new_data;
20248 }
20249 rack->r_ctl.rc_tlp_new_data = 0;
20250 } else {
20251 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
20252 }
20253 if ((rack->r_ctl.crte == NULL) &&
20254 IN_FASTRECOVERY(tp->t_flags) &&
20255 (rack->full_size_rxt == 0) &&
20256 (rack->shape_rxt_to_pacing_min == 0) &&
20257 (len > segsiz)) {
20258 /*
20259 * For prr=off, we need to send only 1 MSS
20260 * at a time. We do this because another sack could
20261 * be arriving that causes us to send retransmits and
20262 * we don't want to be on a long pace due to a larger send
20263 * that keeps us from sending out the retransmit.
20264 */
20265 len = segsiz;
20266 } else if (rack->shape_rxt_to_pacing_min &&
20267 rack->gp_ready) {
20268 /* We use pacing min as shaping len req */
20269 uint32_t maxlen;
20270
20271 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20272 if (len > maxlen)
20273 len = maxlen;
20274 }/* The else is full_size_rxt is on so send it all */
20275 } else {
20276 uint32_t outstanding;
20277 /*
20278 * We are inside of a Fast recovery episode, this
20279 * is caused by a SACK or 3 dup acks. At this point
20280 * we have sent all the retransmissions and we rely
20281 * on PRR to dictate what we will send in the form of
20282 * new data.
20283 */
20284
20285 outstanding = tp->snd_max - tp->snd_una;
20286 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
20287 if (tp->snd_wnd > outstanding) {
20288 len = tp->snd_wnd - outstanding;
20289 /* Check to see if we have the data */
20290 if ((sb_offset + len) > avail) {
20291 /* It does not all fit */
20292 if (avail > sb_offset)
20293 len = avail - sb_offset;
20294 else
20295 len = 0;
20296 }
20297 } else {
20298 len = 0;
20299 }
20300 } else if (avail > sb_offset) {
20301 len = avail - sb_offset;
20302 } else {
20303 len = 0;
20304 }
20305 if (len > 0) {
20306 if (len > rack->r_ctl.rc_prr_sndcnt) {
20307 len = rack->r_ctl.rc_prr_sndcnt;
20308 }
20309 if (len > 0) {
20310 sub_from_prr = 1;
20311 }
20312 }
20313 if (len > segsiz) {
20314 /*
20315 * We should never send more than a MSS when
20316 * retransmitting or sending new data in prr
20317 * mode unless the override flag is on. Most
20318 * likely the PRR algorithm is not going to
20319 * let us send a lot as well :-)
20320 */
20321 if (rack->r_ctl.rc_prr_sendalot == 0) {
20322 len = segsiz;
20323 }
20324 } else if (len < segsiz) {
20325 /*
20326 * Do we send any? The idea here is if the
20327 * send empty's the socket buffer we want to
20328 * do it. However if not then lets just wait
20329 * for our prr_sndcnt to get bigger.
20330 */
20331 long leftinsb;
20332
20333 leftinsb = sbavail(sb) - sb_offset;
20334 if (leftinsb > len) {
20335 /* This send does not empty the sb */
20336 len = 0;
20337 }
20338 }
20339 }
20340 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
20341 /*
20342 * If you have not established
20343 * and are not doing FAST OPEN
20344 * no data please.
20345 */
20346 if ((sack_rxmit == 0) &&
20347 !(tp->t_flags & TF_FASTOPEN)) {
20348 len = 0;
20349 sb_offset = 0;
20350 }
20351 }
20352 if (prefetch_so_done == 0) {
20353 kern_prefetch(so, &prefetch_so_done);
20354 prefetch_so_done = 1;
20355 }
20356 orig_len = len;
20357 /*
20358 * Lop off SYN bit if it has already been sent. However, if this is
20359 * SYN-SENT state and if segment contains data and if we don't know
20360 * that foreign host supports TAO, suppress sending segment.
20361 */
20362 if ((flags & TH_SYN) &&
20363 SEQ_GT(tp->snd_max, tp->snd_una) &&
20364 ((sack_rxmit == 0) &&
20365 (tp->t_rxtshift == 0))) {
20366 /*
20367 * When sending additional segments following a TFO SYN|ACK,
20368 * do not include the SYN bit.
20369 */
20370 if ((tp->t_flags & TF_FASTOPEN) &&
20371 (tp->t_state == TCPS_SYN_RECEIVED))
20372 flags &= ~TH_SYN;
20373 }
20374 /*
20375 * Be careful not to send data and/or FIN on SYN segments. This
20376 * measure is needed to prevent interoperability problems with not
20377 * fully conformant TCP implementations.
20378 */
20379 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
20380 len = 0;
20381 flags &= ~TH_FIN;
20382 }
20383 /*
20384 * On TFO sockets, ensure no data is sent in the following cases:
20385 *
20386 * - When retransmitting SYN|ACK on a passively-created socket
20387 *
20388 * - When retransmitting SYN on an actively created socket
20389 *
20390 * - When sending a zero-length cookie (cookie request) on an
20391 * actively created socket
20392 *
20393 * - When the socket is in the CLOSED state (RST is being sent)
20394 */
20395 if ((tp->t_flags & TF_FASTOPEN) &&
20396 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
20397 ((tp->t_state == TCPS_SYN_SENT) &&
20398 (tp->t_tfo_client_cookie_len == 0)) ||
20399 (flags & TH_RST))) {
20400 sack_rxmit = 0;
20401 len = 0;
20402 }
20403 /* Without fast-open there should never be data sent on a SYN */
20404 if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) {
20405 len = 0;
20406 }
20407 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
20408 /* We only send 1 MSS if we have a DSACK block */
20409 add_flag |= RACK_SENT_W_DSACK;
20410 len = segsiz;
20411 }
20412 if (len <= 0) {
20413 /*
20414 * We have nothing to send, or the window shrank, or
20415 * is closed, do we need to go into persists?
20416 */
20417 len = 0;
20418 if ((tp->snd_wnd == 0) &&
20419 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20420 (tp->snd_una == tp->snd_max) &&
20421 (sb_offset < (int)sbavail(sb))) {
20422 rack_enter_persist(tp, rack, cts, tp->snd_una);
20423 }
20424 } else if ((rsm == NULL) &&
20425 (doing_tlp == 0) &&
20426 (len < pace_max_seg)) {
20427 /*
20428 * We are not sending a maximum sized segment for
20429 * some reason. Should we not send anything (think
20430 * sws or persists)?
20431 */
20432 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20433 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20434 (len < minseg) &&
20435 (len < (int)(sbavail(sb) - sb_offset))) {
20436 /*
20437 * Here the rwnd is less than
20438 * the minimum pacing size, this is not a retransmit,
20439 * we are established and
20440 * the send is not the last in the socket buffer
20441 * we send nothing, and we may enter persists
20442 * if nothing is outstanding.
20443 */
20444 len = 0;
20445 if (tp->snd_max == tp->snd_una) {
20446 /*
20447 * Nothing out we can
20448 * go into persists.
20449 */
20450 rack_enter_persist(tp, rack, cts, tp->snd_una);
20451 }
20452 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
20453 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20454 (len < (int)(sbavail(sb) - sb_offset)) &&
20455 (len < minseg)) {
20456 /*
20457 * Here we are not retransmitting, and
20458 * the cwnd is not so small that we could
20459 * not send at least a min size (rxt timer
20460 * not having gone off), We have 2 segments or
20461 * more already in flight, its not the tail end
20462 * of the socket buffer and the cwnd is blocking
20463 * us from sending out a minimum pacing segment size.
20464 * Lets not send anything.
20465 */
20466 len = 0;
20467 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
20468 min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20469 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20470 (len < (int)(sbavail(sb) - sb_offset)) &&
20471 (TCPS_HAVEESTABLISHED(tp->t_state))) {
20472 /*
20473 * Here we have a send window but we have
20474 * filled it up and we can't send another pacing segment.
20475 * We also have in flight more than 2 segments
20476 * and we are not completing the sb i.e. we allow
20477 * the last bytes of the sb to go out even if
20478 * its not a full pacing segment.
20479 */
20480 len = 0;
20481 } else if ((rack->r_ctl.crte != NULL) &&
20482 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
20483 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
20484 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
20485 (len < (int)(sbavail(sb) - sb_offset))) {
20486 /*
20487 * Here we are doing hardware pacing, this is not a TLP,
20488 * we are not sending a pace max segment size, there is rwnd
20489 * room to send at least N pace_max_seg, the cwnd is greater
20490 * than or equal to a full pacing segments plus 4 mss and we have 2 or
20491 * more segments in flight and its not the tail of the socket buffer.
20492 *
20493 * We don't want to send instead we need to get more ack's in to
20494 * allow us to send a full pacing segment. Normally, if we are pacing
20495 * about the right speed, we should have finished our pacing
20496 * send as most of the acks have come back if we are at the
20497 * right rate. This is a bit fuzzy since return path delay
20498 * can delay the acks, which is why we want to make sure we
20499 * have cwnd space to have a bit more than a max pace segments in flight.
20500 *
20501 * If we have not gotten our acks back we are pacing at too high a
20502 * rate delaying will not hurt and will bring our GP estimate down by
20503 * injecting the delay. If we don't do this we will send
20504 * 2 MSS out in response to the acks being clocked in which
20505 * defeats the point of hw-pacing (i.e. to help us get
20506 * larger TSO's out).
20507 */
20508 len = 0;
20509 }
20510
20511 }
20512 /* len will be >= 0 after this point. */
20513 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
20514 rack_sndbuf_autoscale(rack);
20515 /*
20516 * Decide if we can use TCP Segmentation Offloading (if supported by
20517 * hardware).
20518 *
20519 * TSO may only be used if we are in a pure bulk sending state. The
20520 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
20521 * options prevent using TSO. With TSO the TCP header is the same
20522 * (except for the sequence number) for all generated packets. This
20523 * makes it impossible to transmit any options which vary per
20524 * generated segment or packet.
20525 *
20526 * IPv4 handling has a clear separation of ip options and ip header
20527 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
20528 * the right thing below to provide length of just ip options and thus
20529 * checking for ipoptlen is enough to decide if ip options are present.
20530 */
20531 ipoptlen = 0;
20532 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20533 /*
20534 * Pre-calculate here as we save another lookup into the darknesses
20535 * of IPsec that way and can actually decide if TSO is ok.
20536 */
20537 #ifdef INET6
20538 if (isipv6 && IPSEC_ENABLED(ipv6))
20539 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
20540 #ifdef INET
20541 else
20542 #endif
20543 #endif /* INET6 */
20544 #ifdef INET
20545 if (IPSEC_ENABLED(ipv4))
20546 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
20547 #endif /* INET */
20548 #endif
20549
20550 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20551 ipoptlen += ipsec_optlen;
20552 #endif
20553 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
20554 (tp->t_port == 0) &&
20555 ((tp->t_flags & TF_SIGNATURE) == 0) &&
20556 sack_rxmit == 0 &&
20557 ipoptlen == 0)
20558 tso = 1;
20559 {
20560 uint32_t outstanding __unused;
20561
20562 outstanding = tp->snd_max - tp->snd_una;
20563 if (tp->t_flags & TF_SENTFIN) {
20564 /*
20565 * If we sent a fin, snd_max is 1 higher than
20566 * snd_una
20567 */
20568 outstanding--;
20569 }
20570 if (sack_rxmit) {
20571 if ((rsm->r_flags & RACK_HAS_FIN) == 0)
20572 flags &= ~TH_FIN;
20573 }
20574 }
20575 recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
20576 (long)TCP_MAXWIN << tp->rcv_scale);
20577
20578 /*
20579 * Sender silly window avoidance. We transmit under the following
20580 * conditions when len is non-zero:
20581 *
20582 * - We have a full segment (or more with TSO) - This is the last
20583 * buffer in a write()/send() and we are either idle or running
20584 * NODELAY - we've timed out (e.g. persist timer) - we have more
20585 * than 1/2 the maximum send window's worth of data (receiver may have
20586 * limited the window size) - we need to retransmit
20587 */
20588 if (len) {
20589 if (len >= segsiz) {
20590 goto send;
20591 }
20592 /*
20593 * NOTE! on localhost connections an 'ack' from the remote
20594 * end may occur synchronously with the output and cause us
20595 * to flush a buffer queued with moretocome. XXX
20596 *
20597 */
20598 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
20599 (idle || (tp->t_flags & TF_NODELAY)) &&
20600 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20601 (tp->t_flags & TF_NOPUSH) == 0) {
20602 pass = 2;
20603 goto send;
20604 }
20605 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
20606 pass = 22;
20607 goto send;
20608 }
20609 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
20610 pass = 4;
20611 goto send;
20612 }
20613 if (sack_rxmit) {
20614 pass = 6;
20615 goto send;
20616 }
20617 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
20618 (ctf_outstanding(tp) < (segsiz * 2))) {
20619 /*
20620 * We have less than two MSS outstanding (delayed ack)
20621 * and our rwnd will not let us send a full sized
20622 * MSS. Lets go ahead and let this small segment
20623 * out because we want to try to have at least two
20624 * packets inflight to not be caught by delayed ack.
20625 */
20626 pass = 12;
20627 goto send;
20628 }
20629 }
20630 /*
20631 * Sending of standalone window updates.
20632 *
20633 * Window updates are important when we close our window due to a
20634 * full socket buffer and are opening it again after the application
20635 * reads data from it. Once the window has opened again and the
20636 * remote end starts to send again the ACK clock takes over and
20637 * provides the most current window information.
20638 *
20639 * We must avoid the silly window syndrome whereby every read from
20640 * the receive buffer, no matter how small, causes a window update
20641 * to be sent. We also should avoid sending a flurry of window
20642 * updates when the socket buffer had queued a lot of data and the
20643 * application is doing small reads.
20644 *
20645 * Prevent a flurry of pointless window updates by only sending an
20646 * update when we can increase the advertised window by more than
20647 * 1/4th of the socket buffer capacity. When the buffer is getting
20648 * full or is very small be more aggressive and send an update
20649 * whenever we can increase by two mss sized segments. In all other
20650 * situations the ACK's to new incoming data will carry further
20651 * window increases.
20652 *
20653 * Don't send an independent window update if a delayed ACK is
20654 * pending (it will get piggy-backed on it) or the remote side
20655 * already has done a half-close and won't send more data. Skip
20656 * this if the connection is in T/TCP half-open state.
20657 */
20658 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
20659 !(tp->t_flags & TF_DELACK) &&
20660 !TCPS_HAVERCVDFIN(tp->t_state)) {
20661 /*
20662 * "adv" is the amount we could increase the window, taking
20663 * into account that we are limited by TCP_MAXWIN <<
20664 * tp->rcv_scale.
20665 */
20666 int32_t adv;
20667 int oldwin;
20668
20669 adv = recwin;
20670 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
20671 oldwin = (tp->rcv_adv - tp->rcv_nxt);
20672 if (adv > oldwin)
20673 adv -= oldwin;
20674 else {
20675 /* We can't increase the window */
20676 adv = 0;
20677 }
20678 } else
20679 oldwin = 0;
20680
20681 /*
20682 * If the new window size ends up being the same as or less
20683 * than the old size when it is scaled, then don't force
20684 * a window update.
20685 */
20686 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
20687 goto dontupdate;
20688
20689 if (adv >= (int32_t)(2 * segsiz) &&
20690 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
20691 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
20692 so->so_rcv.sb_hiwat <= 8 * segsiz)) {
20693 pass = 7;
20694 goto send;
20695 }
20696 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
20697 pass = 23;
20698 goto send;
20699 }
20700 }
20701 dontupdate:
20702
20703 /*
20704 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
20705 * is also a catch-all for the retransmit timer timeout case.
20706 */
20707 if (tp->t_flags & TF_ACKNOW) {
20708 pass = 8;
20709 goto send;
20710 }
20711 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
20712 pass = 9;
20713 goto send;
20714 }
20715 /*
20716 * If our state indicates that FIN should be sent and we have not
20717 * yet done so, then we need to send.
20718 */
20719 if ((flags & TH_FIN) &&
20720 (tp->snd_max == tp->snd_una)) {
20721 pass = 11;
20722 goto send;
20723 }
20724 /*
20725 * No reason to send a segment, just return.
20726 */
20727 just_return:
20728 SOCK_SENDBUF_UNLOCK(so);
20729 just_return_nolock:
20730 {
20731 int app_limited = CTF_JR_SENT_DATA;
20732
20733 if ((tp->t_flags & TF_FASTOPEN) == 0 &&
20734 (flags & TH_FIN) &&
20735 (len == 0) &&
20736 (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
20737 ((tp->snd_max - tp->snd_una) <= segsiz)) {
20738 /*
20739 * Ok less than or right at a MSS is
20740 * outstanding. The original FreeBSD stack would
20741 * have sent a FIN, which can speed things up for
20742 * a transactional application doing a MSG_WAITALL.
20743 * To speed things up since we do *not* send a FIN
20744 * if data is outstanding, we send a "challenge ack".
20745 * The idea behind that is instead of having to have
20746 * the peer wait for the delayed-ack timer to run off
20747 * we send an ack that makes the peer send us an ack.
20748 *
20749 * Note we do not send anything if its been less than
20750 * a srtt.
20751 */
20752 uint64_t tmark;
20753
20754 tmark = tcp_get_u64_usecs(&tv);
20755 if ((tmark > rack->r_ctl.lt_timemark) &&
20756 (((tmark - rack->r_ctl.lt_timemark) / 1000) > tp->t_srtt)) {
20757 rack_send_ack_challange(rack);
20758 }
20759 }
20760 if (tot_len_this_send > 0) {
20761 rack->r_ctl.fsb.recwin = recwin;
20762 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
20763 if ((error == 0) &&
20764 rack_use_rfo &&
20765 ((flags & (TH_SYN|TH_FIN)) == 0) &&
20766 (ipoptlen == 0) &&
20767 rack->r_fsb_inited &&
20768 TCPS_HAVEESTABLISHED(tp->t_state) &&
20769 ((IN_RECOVERY(tp->t_flags)) == 0) &&
20770 (doing_tlp == 0) &&
20771 (rack->r_must_retran == 0) &&
20772 ((tp->t_flags & TF_NEEDFIN) == 0) &&
20773 (len > 0) && (orig_len > 0) &&
20774 (orig_len > len) &&
20775 ((orig_len - len) >= segsiz) &&
20776 ((optlen == 0) ||
20777 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
20778 /* We can send at least one more MSS using our fsb */
20779 rack_setup_fast_output(tp, rack, sb, len, orig_len,
20780 segsiz, pace_max_seg, hw_tls, flags);
20781 } else
20782 rack->r_fast_output = 0;
20783 rack_log_fsb(rack, tp, so, flags,
20784 ipoptlen, orig_len, len, 0,
20785 1, optlen, __LINE__, 1);
20786 /* Assure when we leave that snd_nxt will point to top */
20787 if (SEQ_GT(tp->snd_max, tp->snd_nxt))
20788 tp->snd_nxt = tp->snd_max;
20789 } else {
20790 int end_window = 0;
20791 uint32_t seq = tp->gput_ack;
20792
20793 rsm = tqhash_max(rack->r_ctl.tqh);
20794 if (rsm) {
20795 /*
20796 * Mark the last sent that we just-returned (hinting
20797 * that delayed ack may play a role in any rtt measurement).
20798 */
20799 rsm->r_just_ret = 1;
20800 }
20801 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
20802 rack->r_ctl.rc_agg_delayed = 0;
20803 rack->r_early = 0;
20804 rack->r_late = 0;
20805 rack->r_ctl.rc_agg_early = 0;
20806 if ((ctf_outstanding(tp) +
20807 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
20808 minseg)) >= tp->snd_wnd) {
20809 /* We are limited by the rwnd */
20810 app_limited = CTF_JR_RWND_LIMITED;
20811 if (IN_FASTRECOVERY(tp->t_flags))
20812 rack->r_ctl.rc_prr_sndcnt = 0;
20813 } else if (ctf_outstanding(tp) >= sbavail(sb)) {
20814 /* We are limited by whats available -- app limited */
20815 app_limited = CTF_JR_APP_LIMITED;
20816 if (IN_FASTRECOVERY(tp->t_flags))
20817 rack->r_ctl.rc_prr_sndcnt = 0;
20818 } else if ((idle == 0) &&
20819 ((tp->t_flags & TF_NODELAY) == 0) &&
20820 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20821 (len < segsiz)) {
20822 /*
20823 * No delay is not on and the
20824 * user is sending less than 1MSS. This
20825 * brings out SWS avoidance so we
20826 * don't send. Another app-limited case.
20827 */
20828 app_limited = CTF_JR_APP_LIMITED;
20829 } else if (tp->t_flags & TF_NOPUSH) {
20830 /*
20831 * The user has requested no push of
20832 * the last segment and we are
20833 * at the last segment. Another app
20834 * limited case.
20835 */
20836 app_limited = CTF_JR_APP_LIMITED;
20837 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
20838 /* Its the cwnd */
20839 app_limited = CTF_JR_CWND_LIMITED;
20840 } else if (IN_FASTRECOVERY(tp->t_flags) &&
20841 (rack->rack_no_prr == 0) &&
20842 (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
20843 app_limited = CTF_JR_PRR;
20844 } else {
20845 /* Now why here are we not sending? */
20846 #ifdef NOW
20847 #ifdef INVARIANTS
20848 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
20849 #endif
20850 #endif
20851 app_limited = CTF_JR_ASSESSING;
20852 }
20853 /*
20854 * App limited in some fashion, for our pacing GP
20855 * measurements we don't want any gap (even cwnd).
20856 * Close down the measurement window.
20857 */
20858 if (rack_cwnd_block_ends_measure &&
20859 ((app_limited == CTF_JR_CWND_LIMITED) ||
20860 (app_limited == CTF_JR_PRR))) {
20861 /*
20862 * The reason we are not sending is
20863 * the cwnd (or prr). We have been configured
20864 * to end the measurement window in
20865 * this case.
20866 */
20867 end_window = 1;
20868 } else if (rack_rwnd_block_ends_measure &&
20869 (app_limited == CTF_JR_RWND_LIMITED)) {
20870 /*
20871 * We are rwnd limited and have been
20872 * configured to end the measurement
20873 * window in this case.
20874 */
20875 end_window = 1;
20876 } else if (app_limited == CTF_JR_APP_LIMITED) {
20877 /*
20878 * A true application limited period, we have
20879 * run out of data.
20880 */
20881 end_window = 1;
20882 } else if (app_limited == CTF_JR_ASSESSING) {
20883 /*
20884 * In the assessing case we hit the end of
20885 * the if/else and had no known reason
20886 * This will panic us under invariants..
20887 *
20888 * If we get this out in logs we need to
20889 * investigate which reason we missed.
20890 */
20891 end_window = 1;
20892 }
20893 if (end_window) {
20894 uint8_t log = 0;
20895
20896 /* Adjust the Gput measurement */
20897 if ((tp->t_flags & TF_GPUTINPROG) &&
20898 SEQ_GT(tp->gput_ack, tp->snd_max)) {
20899 tp->gput_ack = tp->snd_max;
20900 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
20901 /*
20902 * There is not enough to measure.
20903 */
20904 tp->t_flags &= ~TF_GPUTINPROG;
20905 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
20906 rack->r_ctl.rc_gp_srtt /*flex1*/,
20907 tp->gput_seq,
20908 0, 0, 18, __LINE__, NULL, 0);
20909 } else
20910 log = 1;
20911 }
20912 /* Mark the last packet as app limited */
20913 rsm = tqhash_max(rack->r_ctl.tqh);
20914 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
20915 if (rack->r_ctl.rc_app_limited_cnt == 0)
20916 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
20917 else {
20918 /*
20919 * Go out to the end app limited and mark
20920 * this new one as next and move the end_appl up
20921 * to this guy.
20922 */
20923 if (rack->r_ctl.rc_end_appl)
20924 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
20925 rack->r_ctl.rc_end_appl = rsm;
20926 }
20927 rsm->r_flags |= RACK_APP_LIMITED;
20928 rack->r_ctl.rc_app_limited_cnt++;
20929 }
20930 if (log)
20931 rack_log_pacing_delay_calc(rack,
20932 rack->r_ctl.rc_app_limited_cnt, seq,
20933 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
20934 }
20935 }
20936 /* Check if we need to go into persists or not */
20937 if ((tp->snd_max == tp->snd_una) &&
20938 TCPS_HAVEESTABLISHED(tp->t_state) &&
20939 sbavail(sb) &&
20940 (sbavail(sb) > tp->snd_wnd) &&
20941 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
20942 /* Yes lets make sure to move to persist before timer-start */
20943 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
20944 }
20945 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack);
20946 rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use);
20947 }
20948 #ifdef NETFLIX_SHARED_CWND
20949 if ((sbavail(sb) == 0) &&
20950 rack->r_ctl.rc_scw) {
20951 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20952 rack->rack_scwnd_is_idle = 1;
20953 }
20954 #endif
20955 #ifdef TCP_ACCOUNTING
20956 if (tot_len_this_send > 0) {
20957 crtsc = get_cyclecount();
20958 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20959 tp->tcp_cnt_counters[SND_OUT_DATA]++;
20960 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
20961 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
20962 }
20963 } else {
20964 crtsc = get_cyclecount();
20965 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20966 tp->tcp_cnt_counters[SND_LIMITED]++;
20967 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
20968 }
20969 }
20970 sched_unpin();
20971 #endif
20972 return (0);
20973
20974 send:
20975 if ((rack->r_ctl.crte != NULL) &&
20976 (rsm == NULL) &&
20977 ((rack->rc_hw_nobuf == 1) ||
20978 (rack_hw_check_queue && (check_done == 0)))) {
20979 /*
20980 * We only want to do this once with the hw_check_queue,
20981 * for the enobuf case we would only do it once if
20982 * we come around to again, the flag will be clear.
20983 */
20984 check_done = 1;
20985 pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
20986 if (pacing_delay) {
20987 rack->r_ctl.rc_agg_delayed = 0;
20988 rack->r_ctl.rc_agg_early = 0;
20989 rack->r_early = 0;
20990 rack->r_late = 0;
20991 SOCK_SENDBUF_UNLOCK(so);
20992 goto skip_all_send;
20993 }
20994 }
20995 if (rsm || sack_rxmit)
20996 counter_u64_add(rack_nfto_resend, 1);
20997 else
20998 counter_u64_add(rack_non_fto_send, 1);
20999 if ((flags & TH_FIN) &&
21000 sbavail(sb)) {
21001 /*
21002 * We do not transmit a FIN
21003 * with data outstanding. We
21004 * need to make it so all data
21005 * is acked first.
21006 */
21007 flags &= ~TH_FIN;
21008 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21009 (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
21010 ((tp->snd_max - tp->snd_una) <= segsiz)) {
21011 /*
21012 * Ok less than or right at a MSS is
21013 * outstanding. The original FreeBSD stack would
21014 * have sent a FIN, which can speed things up for
21015 * a transactional application doing a MSG_WAITALL.
21016 * To speed things up since we do *not* send a FIN
21017 * if data is outstanding, we send a "challenge ack".
21018 * The idea behind that is instead of having to have
21019 * the peer wait for the delayed-ack timer to run off
21020 * we send an ack that makes the peer send us an ack.
21021 */
21022 rack_send_ack_challange(rack);
21023 }
21024 }
21025 /* Enforce stack imposed max seg size if we have one */
21026 if (pace_max_seg &&
21027 (len > pace_max_seg)) {
21028 mark = 1;
21029 len = pace_max_seg;
21030 }
21031 if ((rsm == NULL) &&
21032 (rack->pcm_in_progress == 0) &&
21033 (rack->r_ctl.pcm_max_seg > 0) &&
21034 (len >= rack->r_ctl.pcm_max_seg)) {
21035 /* It is large enough for a measurement */
21036 add_flag |= RACK_IS_PCM;
21037 rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag);
21038 } else if (rack_verbose_logging) {
21039 rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag);
21040 }
21041
21042 SOCKBUF_LOCK_ASSERT(sb);
21043 if (len > 0) {
21044 if (len >= segsiz)
21045 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
21046 else
21047 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
21048 }
21049 /*
21050 * Before ESTABLISHED, force sending of initial options unless TCP
21051 * set not to do any options. NOTE: we assume that the IP/TCP header
21052 * plus TCP options always fit in a single mbuf, leaving room for a
21053 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
21054 * + optlen <= MCLBYTES
21055 */
21056 optlen = 0;
21057 #ifdef INET6
21058 if (isipv6)
21059 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
21060 else
21061 #endif
21062 hdrlen = sizeof(struct tcpiphdr);
21063
21064 /*
21065 * Ok what seq are we sending from. If we have
21066 * no rsm to use, then we look at various bits,
21067 * if we are putting out a SYN it will be ISS.
21068 * If we are retransmitting a FIN it will
21069 * be snd_max-1 else its snd_max.
21070 */
21071 if (rsm == NULL) {
21072 if (flags & TH_SYN)
21073 rack_seq = tp->iss;
21074 else if ((flags & TH_FIN) &&
21075 (tp->t_flags & TF_SENTFIN))
21076 rack_seq = tp->snd_max - 1;
21077 else
21078 rack_seq = tp->snd_max;
21079 } else {
21080 rack_seq = rsm->r_start;
21081 }
21082 /*
21083 * Compute options for segment. We only have to care about SYN and
21084 * established connection segments. Options for SYN-ACK segments
21085 * are handled in TCP syncache.
21086 */
21087 to.to_flags = 0;
21088 if ((tp->t_flags & TF_NOOPT) == 0) {
21089 /* Maximum segment size. */
21090 if (flags & TH_SYN) {
21091 to.to_mss = tcp_mssopt(&inp->inp_inc);
21092 if (tp->t_port)
21093 to.to_mss -= V_tcp_udp_tunneling_overhead;
21094 to.to_flags |= TOF_MSS;
21095
21096 /*
21097 * On SYN or SYN|ACK transmits on TFO connections,
21098 * only include the TFO option if it is not a
21099 * retransmit, as the presence of the TFO option may
21100 * have caused the original SYN or SYN|ACK to have
21101 * been dropped by a middlebox.
21102 */
21103 if ((tp->t_flags & TF_FASTOPEN) &&
21104 (tp->t_rxtshift == 0)) {
21105 if (tp->t_state == TCPS_SYN_RECEIVED) {
21106 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
21107 to.to_tfo_cookie =
21108 (u_int8_t *)&tp->t_tfo_cookie.server;
21109 to.to_flags |= TOF_FASTOPEN;
21110 wanted_cookie = 1;
21111 } else if (tp->t_state == TCPS_SYN_SENT) {
21112 to.to_tfo_len =
21113 tp->t_tfo_client_cookie_len;
21114 to.to_tfo_cookie =
21115 tp->t_tfo_cookie.client;
21116 to.to_flags |= TOF_FASTOPEN;
21117 wanted_cookie = 1;
21118 /*
21119 * If we wind up having more data to
21120 * send with the SYN than can fit in
21121 * one segment, don't send any more
21122 * until the SYN|ACK comes back from
21123 * the other end.
21124 */
21125 sendalot = 0;
21126 }
21127 }
21128 }
21129 /* Window scaling. */
21130 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
21131 to.to_wscale = tp->request_r_scale;
21132 to.to_flags |= TOF_SCALE;
21133 }
21134 /* Timestamps. */
21135 if ((tp->t_flags & TF_RCVD_TSTMP) ||
21136 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
21137 uint32_t ts_to_use;
21138
21139 if ((rack->r_rcvpath_rtt_up == 1) &&
21140 (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) {
21141 /*
21142 * When we are doing a rcv_rtt probe all
21143 * other timestamps use the next msec. This
21144 * is safe since our previous ack is in the
21145 * air and we will just have a few more
21146 * on the next ms. This assures that only
21147 * the one ack has the ms_cts that was on
21148 * our ack-probe.
21149 */
21150 ts_to_use = ms_cts + 1;
21151 } else {
21152 ts_to_use = ms_cts;
21153 }
21154 to.to_tsval = ts_to_use + tp->ts_offset;
21155 to.to_tsecr = tp->ts_recent;
21156 to.to_flags |= TOF_TS;
21157 if ((len == 0) &&
21158 (tp->t_state == TCPS_ESTABLISHED) &&
21159 ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) &&
21160 (tp->snd_una == tp->snd_max) &&
21161 (flags & TH_ACK) &&
21162 (sbavail(sb) == 0) &&
21163 (rack->r_ctl.current_round != 0) &&
21164 ((flags & (TH_SYN|TH_FIN)) == 0) &&
21165 (rack->r_rcvpath_rtt_up == 0)) {
21166 rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts;
21167 rack->r_ctl.last_time_of_arm_rcv = cts;
21168 rack->r_rcvpath_rtt_up = 1;
21169 /* Subtract 1 from seq to force a response */
21170 rack_seq--;
21171 }
21172 }
21173 /* Set receive buffer autosizing timestamp. */
21174 if (tp->rfbuf_ts == 0 &&
21175 (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
21176 tp->rfbuf_ts = ms_cts;
21177 }
21178 /* Selective ACK's. */
21179 if (tp->t_flags & TF_SACK_PERMIT) {
21180 if (flags & TH_SYN)
21181 to.to_flags |= TOF_SACKPERM;
21182 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21183 tp->rcv_numsacks > 0) {
21184 to.to_flags |= TOF_SACK;
21185 to.to_nsacks = tp->rcv_numsacks;
21186 to.to_sacks = (u_char *)tp->sackblks;
21187 }
21188 }
21189 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21190 /* TCP-MD5 (RFC2385). */
21191 if (tp->t_flags & TF_SIGNATURE)
21192 to.to_flags |= TOF_SIGNATURE;
21193 #endif
21194
21195 /* Processing the options. */
21196 hdrlen += optlen = tcp_addoptions(&to, opt);
21197 /*
21198 * If we wanted a TFO option to be added, but it was unable
21199 * to fit, ensure no data is sent.
21200 */
21201 if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie &&
21202 !(to.to_flags & TOF_FASTOPEN))
21203 len = 0;
21204 }
21205 if (tp->t_port) {
21206 if (V_tcp_udp_tunneling_port == 0) {
21207 /* The port was removed?? */
21208 SOCK_SENDBUF_UNLOCK(so);
21209 #ifdef TCP_ACCOUNTING
21210 crtsc = get_cyclecount();
21211 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
21212 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
21213 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
21214 }
21215 sched_unpin();
21216 #endif
21217 return (EHOSTUNREACH);
21218 }
21219 hdrlen += sizeof(struct udphdr);
21220 }
21221 #ifdef INET6
21222 if (isipv6)
21223 ipoptlen = ip6_optlen(inp);
21224 else
21225 #endif
21226 if (inp->inp_options)
21227 ipoptlen = inp->inp_options->m_len -
21228 offsetof(struct ipoption, ipopt_list);
21229 else
21230 ipoptlen = 0;
21231 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21232 ipoptlen += ipsec_optlen;
21233 #endif
21234
21235 /*
21236 * Adjust data length if insertion of options will bump the packet
21237 * length beyond the t_maxseg length. Clear the FIN bit because we
21238 * cut off the tail of the segment.
21239 */
21240 if (len + optlen + ipoptlen > tp->t_maxseg) {
21241 if (tso) {
21242 uint32_t if_hw_tsomax;
21243 uint32_t moff;
21244 int32_t max_len;
21245
21246 /* extract TSO information */
21247 if_hw_tsomax = tp->t_tsomax;
21248 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
21249 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
21250 KASSERT(ipoptlen == 0,
21251 ("%s: TSO can't do IP options", __func__));
21252
21253 /*
21254 * Check if we should limit by maximum payload
21255 * length:
21256 */
21257 if (if_hw_tsomax != 0) {
21258 /* compute maximum TSO length */
21259 max_len = (if_hw_tsomax - hdrlen -
21260 max_linkhdr);
21261 if (max_len <= 0) {
21262 len = 0;
21263 } else if (len > max_len) {
21264 if (doing_tlp == 0)
21265 sendalot = 1;
21266 len = max_len;
21267 mark = 2;
21268 }
21269 }
21270 /*
21271 * Prevent the last segment from being fractional
21272 * unless the send sockbuf can be emptied:
21273 */
21274 max_len = (tp->t_maxseg - optlen);
21275 if ((sb_offset + len) < sbavail(sb)) {
21276 moff = len % (u_int)max_len;
21277 if (moff != 0) {
21278 mark = 3;
21279 len -= moff;
21280 }
21281 }
21282 /*
21283 * In case there are too many small fragments don't
21284 * use TSO:
21285 */
21286 if (len <= max_len) {
21287 mark = 4;
21288 tso = 0;
21289 }
21290 /*
21291 * Send the FIN in a separate segment after the bulk
21292 * sending is done. We don't trust the TSO
21293 * implementations to clear the FIN flag on all but
21294 * the last segment.
21295 */
21296 if (tp->t_flags & TF_NEEDFIN) {
21297 sendalot = 4;
21298 }
21299 } else {
21300 mark = 5;
21301 if (optlen + ipoptlen >= tp->t_maxseg) {
21302 /*
21303 * Since we don't have enough space to put
21304 * the IP header chain and the TCP header in
21305 * one packet as required by RFC 7112, don't
21306 * send it. Also ensure that at least one
21307 * byte of the payload can be put into the
21308 * TCP segment.
21309 */
21310 SOCK_SENDBUF_UNLOCK(so);
21311 error = EMSGSIZE;
21312 sack_rxmit = 0;
21313 goto out;
21314 }
21315 len = tp->t_maxseg - optlen - ipoptlen;
21316 sendalot = 5;
21317 }
21318 } else {
21319 tso = 0;
21320 mark = 6;
21321 }
21322 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
21323 ("%s: len > IP_MAXPACKET", __func__));
21324 #ifdef DIAGNOSTIC
21325 #ifdef INET6
21326 if (max_linkhdr + hdrlen > MCLBYTES)
21327 #else
21328 if (max_linkhdr + hdrlen > MHLEN)
21329 #endif
21330 panic("tcphdr too big");
21331 #endif
21332
21333 /*
21334 * This KASSERT is here to catch edge cases at a well defined place.
21335 * Before, those had triggered (random) panic conditions further
21336 * down.
21337 */
21338 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
21339 if ((len == 0) &&
21340 (flags & TH_FIN) &&
21341 (sbused(sb))) {
21342 /*
21343 * We have outstanding data, don't send a fin by itself.
21344 *
21345 * Check to see if we need to send a challenge ack.
21346 */
21347 if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
21348 ((tp->snd_max - tp->snd_una) <= segsiz)) {
21349 /*
21350 * Ok less than or right at a MSS is
21351 * outstanding. The original FreeBSD stack would
21352 * have sent a FIN, which can speed things up for
21353 * a transactional application doing a MSG_WAITALL.
21354 * To speed things up since we do *not* send a FIN
21355 * if data is outstanding, we send a "challenge ack".
21356 * The idea behind that is instead of having to have
21357 * the peer wait for the delayed-ack timer to run off
21358 * we send an ack that makes the peer send us an ack.
21359 */
21360 rack_send_ack_challange(rack);
21361 }
21362 goto just_return;
21363 }
21364 /*
21365 * Grab a header mbuf, attaching a copy of data to be transmitted,
21366 * and initialize the header from the template for sends on this
21367 * connection.
21368 */
21369 hw_tls = tp->t_nic_ktls_xmit != 0;
21370 if (len) {
21371 uint32_t max_val;
21372 uint32_t moff;
21373
21374 if (pace_max_seg)
21375 max_val = pace_max_seg;
21376 else
21377 max_val = len;
21378 /*
21379 * We allow a limit on sending with hptsi.
21380 */
21381 if (len > max_val) {
21382 mark = 7;
21383 len = max_val;
21384 }
21385 #ifdef INET6
21386 if (MHLEN < hdrlen + max_linkhdr)
21387 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
21388 else
21389 #endif
21390 m = m_gethdr(M_NOWAIT, MT_DATA);
21391
21392 if (m == NULL) {
21393 SOCK_SENDBUF_UNLOCK(so);
21394 error = ENOBUFS;
21395 sack_rxmit = 0;
21396 goto out;
21397 }
21398 m->m_data += max_linkhdr;
21399 m->m_len = hdrlen;
21400
21401 /*
21402 * Start the m_copy functions from the closest mbuf to the
21403 * sb_offset in the socket buffer chain.
21404 */
21405 mb = sbsndptr_noadv(sb, sb_offset, &moff);
21406 s_mb = mb;
21407 s_moff = moff;
21408 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
21409 m_copydata(mb, moff, (int)len,
21410 mtod(m, caddr_t)+hdrlen);
21411 /*
21412 * If we are not retransmitting advance the
21413 * sndptr to help remember the next place in
21414 * the sb.
21415 */
21416 if (rsm == NULL)
21417 sbsndptr_adv(sb, mb, len);
21418 m->m_len += len;
21419 } else {
21420 struct sockbuf *msb;
21421
21422 /*
21423 * If we are not retransmitting pass in msb so
21424 * the socket buffer can be advanced. Otherwise
21425 * set it to NULL if its a retransmission since
21426 * we don't want to change the sb remembered
21427 * location.
21428 */
21429 if (rsm == NULL)
21430 msb = sb;
21431 else
21432 msb = NULL;
21433 m->m_next = tcp_m_copym(
21434 mb, moff, &len,
21435 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
21436 ((rsm == NULL) ? hw_tls : 0));
21437 if (len <= (tp->t_maxseg - optlen)) {
21438 /*
21439 * Must have ran out of mbufs for the copy
21440 * shorten it to no longer need tso. Lets
21441 * not put on sendalot since we are low on
21442 * mbufs.
21443 */
21444 tso = 0;
21445 }
21446 if (m->m_next == NULL) {
21447 SOCK_SENDBUF_UNLOCK(so);
21448 (void)m_free(m);
21449 error = ENOBUFS;
21450 sack_rxmit = 0;
21451 goto out;
21452 }
21453 }
21454 if (sack_rxmit) {
21455 if (rsm && (rsm->r_flags & RACK_TLP)) {
21456 /*
21457 * TLP should not count in retran count, but
21458 * in its own bin
21459 */
21460 counter_u64_add(rack_tlp_retran, 1);
21461 counter_u64_add(rack_tlp_retran_bytes, len);
21462 } else {
21463 tp->t_sndrexmitpack++;
21464 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
21465 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
21466 }
21467 #ifdef STATS
21468 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
21469 len);
21470 #endif
21471 } else {
21472 KMOD_TCPSTAT_INC(tcps_sndpack);
21473 KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
21474 #ifdef STATS
21475 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
21476 len);
21477 #endif
21478 }
21479 /*
21480 * If we're sending everything we've got, set PUSH. (This
21481 * will keep happy those implementations which only give
21482 * data to the user when a buffer fills or a PUSH comes in.)
21483 */
21484 if (sb_offset + len == sbused(sb) &&
21485 sbused(sb) &&
21486 !(flags & TH_SYN)) {
21487 flags |= TH_PUSH;
21488 add_flag |= RACK_HAD_PUSH;
21489 }
21490 SOCK_SENDBUF_UNLOCK(so);
21491 } else {
21492 SOCK_SENDBUF_UNLOCK(so);
21493 if (tp->t_flags & TF_ACKNOW)
21494 KMOD_TCPSTAT_INC(tcps_sndacks);
21495 else if (flags & (TH_SYN | TH_FIN | TH_RST))
21496 KMOD_TCPSTAT_INC(tcps_sndctrl);
21497 else
21498 KMOD_TCPSTAT_INC(tcps_sndwinup);
21499
21500 m = m_gethdr(M_NOWAIT, MT_DATA);
21501 if (m == NULL) {
21502 error = ENOBUFS;
21503 sack_rxmit = 0;
21504 goto out;
21505 }
21506 #ifdef INET6
21507 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
21508 MHLEN >= hdrlen) {
21509 M_ALIGN(m, hdrlen);
21510 } else
21511 #endif
21512 m->m_data += max_linkhdr;
21513 m->m_len = hdrlen;
21514 }
21515 SOCK_SENDBUF_UNLOCK_ASSERT(so);
21516 m->m_pkthdr.rcvif = (struct ifnet *)0;
21517 #ifdef MAC
21518 mac_inpcb_create_mbuf(inp, m);
21519 #endif
21520 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21521 #ifdef INET6
21522 if (isipv6)
21523 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
21524 else
21525 #endif /* INET6 */
21526 #ifdef INET
21527 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
21528 #endif
21529 th = rack->r_ctl.fsb.th;
21530 udp = rack->r_ctl.fsb.udp;
21531 if (udp) {
21532 #ifdef INET6
21533 if (isipv6)
21534 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21535 else
21536 #endif /* INET6 */
21537 ulen = hdrlen + len - sizeof(struct ip);
21538 udp->uh_ulen = htons(ulen);
21539 }
21540 } else {
21541 #ifdef INET6
21542 if (isipv6) {
21543 ip6 = mtod(m, struct ip6_hdr *);
21544 if (tp->t_port) {
21545 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
21546 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21547 udp->uh_dport = tp->t_port;
21548 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21549 udp->uh_ulen = htons(ulen);
21550 th = (struct tcphdr *)(udp + 1);
21551 } else
21552 th = (struct tcphdr *)(ip6 + 1);
21553 tcpip_fillheaders(inp, tp->t_port, ip6, th);
21554 } else
21555 #endif /* INET6 */
21556 {
21557 #ifdef INET
21558 ip = mtod(m, struct ip *);
21559 if (tp->t_port) {
21560 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
21561 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21562 udp->uh_dport = tp->t_port;
21563 ulen = hdrlen + len - sizeof(struct ip);
21564 udp->uh_ulen = htons(ulen);
21565 th = (struct tcphdr *)(udp + 1);
21566 } else
21567 th = (struct tcphdr *)(ip + 1);
21568 tcpip_fillheaders(inp, tp->t_port, ip, th);
21569 #endif
21570 }
21571 }
21572 /*
21573 * If we are starting a connection, send ECN setup SYN packet. If we
21574 * are on a retransmit, we may resend those bits a number of times
21575 * as per RFC 3168.
21576 */
21577 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
21578 flags |= tcp_ecn_output_syn_sent(tp);
21579 }
21580 /* Also handle parallel SYN for ECN */
21581 if (TCPS_HAVERCVDSYN(tp->t_state) &&
21582 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
21583 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
21584 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
21585 (tp->t_flags2 & TF2_ECN_SND_ECE))
21586 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
21587 #ifdef INET6
21588 if (isipv6) {
21589 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
21590 ip6->ip6_flow |= htonl(ect << 20);
21591 }
21592 else
21593 #endif
21594 {
21595 #ifdef INET
21596 ip->ip_tos &= ~IPTOS_ECN_MASK;
21597 ip->ip_tos |= ect;
21598 #endif
21599 }
21600 }
21601 th->th_seq = htonl(rack_seq);
21602 th->th_ack = htonl(tp->rcv_nxt);
21603 tcp_set_flags(th, flags);
21604 /*
21605 * Calculate receive window. Don't shrink window, but avoid silly
21606 * window syndrome.
21607 * If a RST segment is sent, advertise a window of zero.
21608 */
21609 if (flags & TH_RST) {
21610 recwin = 0;
21611 } else {
21612 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
21613 recwin < (long)segsiz) {
21614 recwin = 0;
21615 }
21616 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
21617 recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
21618 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
21619 }
21620
21621 /*
21622 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
21623 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is
21624 * handled in syncache.
21625 */
21626 if (flags & TH_SYN)
21627 th->th_win = htons((u_short)
21628 (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
21629 else {
21630 /* Avoid shrinking window with window scaling. */
21631 recwin = roundup2(recwin, 1 << tp->rcv_scale);
21632 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
21633 }
21634 /*
21635 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
21636 * window. This may cause the remote transmitter to stall. This
21637 * flag tells soreceive() to disable delayed acknowledgements when
21638 * draining the buffer. This can occur if the receiver is
21639 * attempting to read more data than can be buffered prior to
21640 * transmitting on the connection.
21641 */
21642 if (th->th_win == 0) {
21643 tp->t_sndzerowin++;
21644 tp->t_flags |= TF_RXWIN0SENT;
21645 } else
21646 tp->t_flags &= ~TF_RXWIN0SENT;
21647 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */
21648 /* Now are we using fsb?, if so copy the template data to the mbuf */
21649 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21650 uint8_t *cpto;
21651
21652 cpto = mtod(m, uint8_t *);
21653 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
21654 /*
21655 * We have just copied in:
21656 * IP/IP6
21657 * <optional udphdr>
21658 * tcphdr (no options)
21659 *
21660 * We need to grab the correct pointers into the mbuf
21661 * for both the tcp header, and possibly the udp header (if tunneling).
21662 * We do this by using the offset in the copy buffer and adding it
21663 * to the mbuf base pointer (cpto).
21664 */
21665 #ifdef INET6
21666 if (isipv6)
21667 ip6 = mtod(m, struct ip6_hdr *);
21668 else
21669 #endif /* INET6 */
21670 #ifdef INET
21671 ip = mtod(m, struct ip *);
21672 #endif
21673 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
21674 /* If we have a udp header lets set it into the mbuf as well */
21675 if (udp)
21676 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
21677 }
21678 if (optlen) {
21679 bcopy(opt, th + 1, optlen);
21680 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
21681 }
21682 /*
21683 * Put TCP length in extended header, and then checksum extended
21684 * header and data.
21685 */
21686 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
21687 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21688 if (to.to_flags & TOF_SIGNATURE) {
21689 /*
21690 * Calculate MD5 signature and put it into the place
21691 * determined before.
21692 * NOTE: since TCP options buffer doesn't point into
21693 * mbuf's data, calculate offset and use it.
21694 */
21695 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
21696 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
21697 /*
21698 * Do not send segment if the calculation of MD5
21699 * digest has failed.
21700 */
21701 goto out;
21702 }
21703 }
21704 #endif
21705 #ifdef INET6
21706 if (isipv6) {
21707 /*
21708 * ip6_plen is not need to be filled now, and will be filled
21709 * in ip6_output.
21710 */
21711 if (tp->t_port) {
21712 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
21713 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21714 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
21715 th->th_sum = htons(0);
21716 UDPSTAT_INC(udps_opackets);
21717 } else {
21718 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
21719 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21720 th->th_sum = in6_cksum_pseudo(ip6,
21721 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
21722 0);
21723 }
21724 }
21725 #endif
21726 #if defined(INET6) && defined(INET)
21727 else
21728 #endif
21729 #ifdef INET
21730 {
21731 if (tp->t_port) {
21732 m->m_pkthdr.csum_flags = CSUM_UDP;
21733 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21734 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
21735 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
21736 th->th_sum = htons(0);
21737 UDPSTAT_INC(udps_opackets);
21738 } else {
21739 m->m_pkthdr.csum_flags = CSUM_TCP;
21740 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21741 th->th_sum = in_pseudo(ip->ip_src.s_addr,
21742 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
21743 IPPROTO_TCP + len + optlen));
21744 }
21745 /* IP version must be set here for ipv4/ipv6 checking later */
21746 KASSERT(ip->ip_v == IPVERSION,
21747 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
21748 }
21749 #endif
21750 /*
21751 * Enable TSO and specify the size of the segments. The TCP pseudo
21752 * header checksum is always provided. XXX: Fixme: This is currently
21753 * not the case for IPv6.
21754 */
21755 if (tso) {
21756 /*
21757 * Here we must use t_maxseg and the optlen since
21758 * the optlen may include SACK's (or DSACK).
21759 */
21760 KASSERT(len > tp->t_maxseg - optlen,
21761 ("%s: len <= tso_segsz", __func__));
21762 m->m_pkthdr.csum_flags |= CSUM_TSO;
21763 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
21764 }
21765 KASSERT(len + hdrlen == m_length(m, NULL),
21766 ("%s: mbuf chain different than expected: %d + %u != %u",
21767 __func__, len, hdrlen, m_length(m, NULL)));
21768
21769 #ifdef TCP_HHOOK
21770 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
21771 hhook_run_tcp_est_out(tp, th, &to, len, tso);
21772 #endif
21773 if ((rack->r_ctl.crte != NULL) &&
21774 (rack->rc_hw_nobuf == 0) &&
21775 tcp_bblogging_on(tp)) {
21776 rack_log_queue_level(tp, rack, len, &tv, cts);
21777 }
21778 /* We're getting ready to send; log now. */
21779 if (tcp_bblogging_on(rack->rc_tp)) {
21780 union tcp_log_stackspecific log;
21781
21782 memset(&log, 0, sizeof(log));
21783 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
21784 if (rack->rack_no_prr)
21785 log.u_bbr.flex1 = 0;
21786 else
21787 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
21788 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
21789 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
21790 log.u_bbr.flex4 = orig_len;
21791 /* Save off the early/late values */
21792 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
21793 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
21794 log.u_bbr.bw_inuse = rack_get_bw(rack);
21795 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
21796 log.u_bbr.flex8 = 0;
21797 if (rsm) {
21798 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
21799 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
21800 counter_u64_add(rack_collapsed_win_rxt, 1);
21801 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
21802 }
21803 if (doing_tlp)
21804 log.u_bbr.flex8 = 2;
21805 else
21806 log.u_bbr.flex8 = 1;
21807 } else {
21808 if (doing_tlp)
21809 log.u_bbr.flex8 = 3;
21810 }
21811 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
21812 log.u_bbr.flex7 = mark;
21813 log.u_bbr.flex7 <<= 8;
21814 log.u_bbr.flex7 |= pass;
21815 log.u_bbr.pkts_out = tp->t_maxseg;
21816 log.u_bbr.timeStamp = cts;
21817 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
21818 if (rsm && (rsm->r_rtr_cnt > 0)) {
21819 /*
21820 * When we have a retransmit we want to log the
21821 * burst at send and flight at send from before.
21822 */
21823 log.u_bbr.flex5 = rsm->r_fas;
21824 log.u_bbr.bbr_substate = rsm->r_bas;
21825 } else {
21826 /*
21827 * New transmits we log in flex5 the inflight again as
21828 * well as the number of segments in our send in the
21829 * substate field.
21830 */
21831 log.u_bbr.flex5 = log.u_bbr.inflight;
21832 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
21833 }
21834 log.u_bbr.lt_epoch = cwnd_to_use;
21835 log.u_bbr.delivered = sendalot;
21836 log.u_bbr.rttProp = (uintptr_t)rsm;
21837 log.u_bbr.pkt_epoch = __LINE__;
21838 if (rsm) {
21839 log.u_bbr.delRate = rsm->r_flags;
21840 log.u_bbr.delRate <<= 31;
21841 log.u_bbr.delRate |= rack->r_must_retran;
21842 log.u_bbr.delRate <<= 1;
21843 log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21844 } else {
21845 log.u_bbr.delRate = rack->r_must_retran;
21846 log.u_bbr.delRate <<= 1;
21847 log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21848 }
21849 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
21850 len, &log, false, NULL, __func__, __LINE__, &tv);
21851 } else
21852 lgb = NULL;
21853
21854 /*
21855 * Fill in IP length and desired time to live and send to IP level.
21856 * There should be a better way to handle ttl and tos; we could keep
21857 * them in the template, but need a way to checksum without them.
21858 */
21859 /*
21860 * m->m_pkthdr.len should have been set before cksum calcuration,
21861 * because in6_cksum() need it.
21862 */
21863 #ifdef INET6
21864 if (isipv6) {
21865 /*
21866 * we separately set hoplimit for every segment, since the
21867 * user might want to change the value via setsockopt. Also,
21868 * desired default hop limit might be changed via Neighbor
21869 * Discovery.
21870 */
21871 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
21872
21873 /*
21874 * Set the packet size here for the benefit of DTrace
21875 * probes. ip6_output() will set it properly; it's supposed
21876 * to include the option header lengths as well.
21877 */
21878 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
21879
21880 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
21881 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21882 else
21883 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21884
21885 if (tp->t_state == TCPS_SYN_SENT)
21886 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
21887
21888 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
21889 /* TODO: IPv6 IP6TOS_ECT bit on */
21890 error = ip6_output(m,
21891 inp->in6p_outputopts,
21892 &inp->inp_route6,
21893 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
21894 NULL, NULL, inp);
21895
21896 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
21897 mtu = inp->inp_route6.ro_nh->nh_mtu;
21898 }
21899 #endif /* INET6 */
21900 #if defined(INET) && defined(INET6)
21901 else
21902 #endif
21903 #ifdef INET
21904 {
21905 ip->ip_len = htons(m->m_pkthdr.len);
21906 #ifdef INET6
21907 if (inp->inp_vflag & INP_IPV6PROTO)
21908 ip->ip_ttl = in6_selecthlim(inp, NULL);
21909 #endif /* INET6 */
21910 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
21911 /*
21912 * If we do path MTU discovery, then we set DF on every
21913 * packet. This might not be the best thing to do according
21914 * to RFC3390 Section 2. However the tcp hostcache migitates
21915 * the problem so it affects only the first tcp connection
21916 * with a host.
21917 *
21918 * NB: Don't set DF on small MTU/MSS to have a safe
21919 * fallback.
21920 */
21921 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
21922 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21923 if (tp->t_port == 0 || len < V_tcp_minmss) {
21924 ip->ip_off |= htons(IP_DF);
21925 }
21926 } else {
21927 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21928 }
21929
21930 if (tp->t_state == TCPS_SYN_SENT)
21931 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
21932
21933 TCP_PROBE5(send, NULL, tp, ip, tp, th);
21934
21935 error = ip_output(m,
21936 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21937 inp->inp_options,
21938 #else
21939 NULL,
21940 #endif
21941 &inp->inp_route,
21942 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
21943 inp);
21944 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
21945 mtu = inp->inp_route.ro_nh->nh_mtu;
21946 }
21947 #endif /* INET */
21948 if (lgb) {
21949 lgb->tlb_errno = error;
21950 lgb = NULL;
21951 }
21952
21953 out:
21954 /*
21955 * In transmit state, time the transmission and arrange for the
21956 * retransmit. In persist state, just set snd_max.
21957 */
21958 if ((rsm == NULL) && doing_tlp)
21959 add_flag |= RACK_TLP;
21960 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
21961 rack_to_usec_ts(&tv),
21962 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
21963 if (error == 0) {
21964 if (add_flag & RACK_IS_PCM) {
21965 /* We just launched a PCM */
21966 /* rrs here log */
21967 rack->pcm_in_progress = 1;
21968 rack->pcm_needed = 0;
21969 rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag);
21970 }
21971 if (rsm == NULL) {
21972 if (rack->lt_bw_up == 0) {
21973 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv);
21974 rack->r_ctl.lt_seq = tp->snd_una;
21975 rack->lt_bw_up = 1;
21976 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) {
21977 /*
21978 * Need to record what we have since we are
21979 * approaching seq wrap.
21980 */
21981 uint64_t tmark;
21982
21983 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
21984 rack->r_ctl.lt_seq = tp->snd_una;
21985 tmark = tcp_get_u64_usecs(&tv);
21986 if (tmark > rack->r_ctl.lt_timemark) {
21987 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
21988 rack->r_ctl.lt_timemark = tmark;
21989 }
21990 }
21991 }
21992 rack->forced_ack = 0; /* If we send something zap the FA flag */
21993 counter_u64_add(rack_total_bytes, len);
21994 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
21995 if (rsm && doing_tlp) {
21996 rack->rc_last_sent_tlp_past_cumack = 0;
21997 rack->rc_last_sent_tlp_seq_valid = 1;
21998 rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
21999 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
22000 }
22001 if (rack->rc_hw_nobuf) {
22002 rack->rc_hw_nobuf = 0;
22003 rack->r_ctl.rc_agg_delayed = 0;
22004 rack->r_early = 0;
22005 rack->r_late = 0;
22006 rack->r_ctl.rc_agg_early = 0;
22007 }
22008 if (rsm && (doing_tlp == 0)) {
22009 /* Set we retransmitted */
22010 rack->rc_gp_saw_rec = 1;
22011 } else {
22012 if (cwnd_to_use > tp->snd_ssthresh) {
22013 /* Set we sent in CA */
22014 rack->rc_gp_saw_ca = 1;
22015 } else {
22016 /* Set we sent in SS */
22017 rack->rc_gp_saw_ss = 1;
22018 }
22019 }
22020 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
22021 (tp->t_flags & TF_SACK_PERMIT) &&
22022 tp->rcv_numsacks > 0)
22023 tcp_clean_dsack_blocks(tp);
22024 tot_len_this_send += len;
22025 if (len == 0) {
22026 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
22027 } else {
22028 int idx;
22029
22030 idx = (len / segsiz) + 3;
22031 if (idx >= TCP_MSS_ACCT_ATIMER)
22032 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
22033 else
22034 counter_u64_add(rack_out_size[idx], 1);
22035 }
22036 }
22037 if ((rack->rack_no_prr == 0) &&
22038 sub_from_prr &&
22039 (error == 0)) {
22040 if (rack->r_ctl.rc_prr_sndcnt >= len)
22041 rack->r_ctl.rc_prr_sndcnt -= len;
22042 else
22043 rack->r_ctl.rc_prr_sndcnt = 0;
22044 }
22045 sub_from_prr = 0;
22046 if (rsm != NULL) {
22047 if (doing_tlp)
22048 /* Make sure the TLP is added */
22049 rsm->r_flags |= RACK_TLP;
22050 else
22051 /* If its a resend without TLP then it must not have the flag */
22052 rsm->r_flags &= ~RACK_TLP;
22053 }
22054 if ((error == 0) &&
22055 (len > 0) &&
22056 (tp->snd_una == tp->snd_max))
22057 rack->r_ctl.rc_tlp_rxt_last_time = cts;
22058
22059 {
22060 /*
22061 * This block is not associated with the above error == 0 test.
22062 * It is used to advance snd_max if we have a new transmit.
22063 */
22064 tcp_seq startseq = tp->snd_max;
22065
22066
22067 if (rsm && (doing_tlp == 0))
22068 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
22069 if (error)
22070 /* We don't log or do anything with errors */
22071 goto nomore;
22072 if (doing_tlp == 0) {
22073 if (rsm == NULL) {
22074 /*
22075 * Not a retransmission of some
22076 * sort, new data is going out so
22077 * clear our TLP count and flag.
22078 */
22079 rack->rc_tlp_in_progress = 0;
22080 rack->r_ctl.rc_tlp_cnt_out = 0;
22081 }
22082 } else {
22083 /*
22084 * We have just sent a TLP, mark that it is true
22085 * and make sure our in progress is set so we
22086 * continue to check the count.
22087 */
22088 rack->rc_tlp_in_progress = 1;
22089 rack->r_ctl.rc_tlp_cnt_out++;
22090 }
22091 /*
22092 * If we are retransmitting we are done, snd_max
22093 * does not get updated.
22094 */
22095 if (sack_rxmit)
22096 goto nomore;
22097 if ((tp->snd_una == tp->snd_max) && (len > 0)) {
22098 /*
22099 * Update the time we just added data since
22100 * nothing was outstanding.
22101 */
22102 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
22103 tp->t_acktime = ticks;
22104 }
22105 /*
22106 * Now for special SYN/FIN handling.
22107 */
22108 if (flags & (TH_SYN | TH_FIN)) {
22109 if ((flags & TH_SYN) &&
22110 ((tp->t_flags & TF_SENTSYN) == 0)) {
22111 tp->snd_max++;
22112 tp->t_flags |= TF_SENTSYN;
22113 }
22114 if ((flags & TH_FIN) &&
22115 ((tp->t_flags & TF_SENTFIN) == 0)) {
22116 tp->snd_max++;
22117 tp->t_flags |= TF_SENTFIN;
22118 }
22119 }
22120 tp->snd_max += len;
22121 if (rack->rc_new_rnd_needed) {
22122 rack_new_round_starts(tp, rack, tp->snd_max);
22123 }
22124 /*
22125 * Time this transmission if not a retransmission and
22126 * not currently timing anything.
22127 * This is only relevant in case of switching back to
22128 * the base stack.
22129 */
22130 if (tp->t_rtttime == 0) {
22131 tp->t_rtttime = ticks;
22132 tp->t_rtseq = startseq;
22133 KMOD_TCPSTAT_INC(tcps_segstimed);
22134 }
22135 if (len &&
22136 ((tp->t_flags & TF_GPUTINPROG) == 0))
22137 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
22138 /*
22139 * If we are doing FO we need to update the mbuf position and subtract
22140 * this happens when the peer sends us duplicate information and
22141 * we thus want to send a DSACK.
22142 *
22143 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO
22144 * turned off? If not then we are going to echo multiple DSACK blocks
22145 * out (with the TSO), which we should not be doing.
22146 */
22147 if (rack->r_fast_output && len) {
22148 if (rack->r_ctl.fsb.left_to_send > len)
22149 rack->r_ctl.fsb.left_to_send -= len;
22150 else
22151 rack->r_ctl.fsb.left_to_send = 0;
22152 if (rack->r_ctl.fsb.left_to_send < segsiz)
22153 rack->r_fast_output = 0;
22154 if (rack->r_fast_output) {
22155 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
22156 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
22157 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
22158 }
22159 }
22160 if (rack_pcm_blast == 0) {
22161 if ((orig_len > len) &&
22162 (add_flag & RACK_IS_PCM) &&
22163 (len < pace_max_seg) &&
22164 ((pace_max_seg - len) > segsiz)) {
22165 /*
22166 * We are doing a PCM measurement and we did
22167 * not get enough data in the TSO to meet the
22168 * burst requirement.
22169 */
22170 uint32_t n_len;
22171
22172 n_len = (orig_len - len);
22173 orig_len -= len;
22174 pace_max_seg -= len;
22175 len = n_len;
22176 sb_offset = tp->snd_max - tp->snd_una;
22177 /* Re-lock for the next spin */
22178 SOCK_SENDBUF_LOCK(so);
22179 goto send;
22180 }
22181 } else {
22182 if ((orig_len > len) &&
22183 (add_flag & RACK_IS_PCM) &&
22184 ((orig_len - len) > segsiz)) {
22185 /*
22186 * We are doing a PCM measurement and we did
22187 * not get enough data in the TSO to meet the
22188 * burst requirement.
22189 */
22190 uint32_t n_len;
22191
22192 n_len = (orig_len - len);
22193 orig_len -= len;
22194 len = n_len;
22195 sb_offset = tp->snd_max - tp->snd_una;
22196 /* Re-lock for the next spin */
22197 SOCK_SENDBUF_LOCK(so);
22198 goto send;
22199 }
22200 }
22201 }
22202 nomore:
22203 if (error) {
22204 rack->r_ctl.rc_agg_delayed = 0;
22205 rack->r_early = 0;
22206 rack->r_late = 0;
22207 rack->r_ctl.rc_agg_early = 0;
22208 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */
22209 /*
22210 * Failures do not advance the seq counter above. For the
22211 * case of ENOBUFS we will fall out and retry in 1ms with
22212 * the hpts. Everything else will just have to retransmit
22213 * with the timer.
22214 *
22215 * In any case, we do not want to loop around for another
22216 * send without a good reason.
22217 */
22218 sendalot = 0;
22219 switch (error) {
22220 case EPERM:
22221 case EACCES:
22222 tp->t_softerror = error;
22223 #ifdef TCP_ACCOUNTING
22224 crtsc = get_cyclecount();
22225 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22226 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22227 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22228 }
22229 sched_unpin();
22230 #endif
22231 return (error);
22232 case ENOBUFS:
22233 /*
22234 * Pace us right away to retry in a some
22235 * time
22236 */
22237 if (rack->r_ctl.crte != NULL) {
22238 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
22239 if (tcp_bblogging_on(rack->rc_tp))
22240 rack_log_queue_level(tp, rack, len, &tv, cts);
22241 } else
22242 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
22243 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
22244 if (rack->rc_enobuf < 0x7f)
22245 rack->rc_enobuf++;
22246 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
22247 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
22248 if (rack->r_ctl.crte != NULL) {
22249 counter_u64_add(rack_saw_enobuf_hw, 1);
22250 tcp_rl_log_enobuf(rack->r_ctl.crte);
22251 }
22252 counter_u64_add(rack_saw_enobuf, 1);
22253 goto enobufs;
22254 case EMSGSIZE:
22255 /*
22256 * For some reason the interface we used initially
22257 * to send segments changed to another or lowered
22258 * its MTU. If TSO was active we either got an
22259 * interface without TSO capabilits or TSO was
22260 * turned off. If we obtained mtu from ip_output()
22261 * then update it and try again.
22262 */
22263 if (tso)
22264 tp->t_flags &= ~TF_TSO;
22265 if (mtu != 0) {
22266 int saved_mtu;
22267
22268 saved_mtu = tp->t_maxseg;
22269 tcp_mss_update(tp, -1, mtu, NULL, NULL);
22270 if (saved_mtu > tp->t_maxseg) {
22271 goto again;
22272 }
22273 }
22274 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
22275 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
22276 #ifdef TCP_ACCOUNTING
22277 crtsc = get_cyclecount();
22278 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22279 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22280 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22281 }
22282 sched_unpin();
22283 #endif
22284 return (error);
22285 case ENETUNREACH:
22286 counter_u64_add(rack_saw_enetunreach, 1);
22287 /* FALLTHROUGH */
22288 case EHOSTDOWN:
22289 case EHOSTUNREACH:
22290 case ENETDOWN:
22291 if (TCPS_HAVERCVDSYN(tp->t_state)) {
22292 tp->t_softerror = error;
22293 error = 0;
22294 }
22295 /* FALLTHROUGH */
22296 default:
22297 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
22298 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
22299 #ifdef TCP_ACCOUNTING
22300 crtsc = get_cyclecount();
22301 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22302 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22303 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22304 }
22305 sched_unpin();
22306 #endif
22307 return (error);
22308 }
22309 } else {
22310 rack->rc_enobuf = 0;
22311 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
22312 rack->r_ctl.retran_during_recovery += len;
22313 }
22314 KMOD_TCPSTAT_INC(tcps_sndtotal);
22315
22316 /*
22317 * Data sent (as far as we can tell). If this advertises a larger
22318 * window than any other segment, then remember the size of the
22319 * advertised window. Any pending ACK has now been sent.
22320 */
22321 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
22322 tp->rcv_adv = tp->rcv_nxt + recwin;
22323
22324 tp->last_ack_sent = tp->rcv_nxt;
22325 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
22326 enobufs:
22327 if (sendalot) {
22328 /* Do we need to turn off sendalot? */
22329 if (pace_max_seg &&
22330 (tot_len_this_send >= pace_max_seg)) {
22331 /* We hit our max. */
22332 sendalot = 0;
22333 }
22334 }
22335 if ((error == 0) && (flags & TH_FIN))
22336 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
22337 if (flags & TH_RST) {
22338 /*
22339 * We don't send again after sending a RST.
22340 */
22341 pacing_delay = 0;
22342 sendalot = 0;
22343 if (error == 0)
22344 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
22345 } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) {
22346 /*
22347 * Get our pacing rate, if an error
22348 * occurred in sending (ENOBUF) we would
22349 * hit the else if with slot preset. Other
22350 * errors return.
22351 */
22352 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
22353 }
22354 /* We have sent clear the flag */
22355 rack->r_ent_rec_ns = 0;
22356 if (rack->r_must_retran) {
22357 if (rsm) {
22358 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
22359 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
22360 /*
22361 * We have retransmitted all.
22362 */
22363 rack->r_must_retran = 0;
22364 rack->r_ctl.rc_out_at_rto = 0;
22365 }
22366 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22367 /*
22368 * Sending new data will also kill
22369 * the loop.
22370 */
22371 rack->r_must_retran = 0;
22372 rack->r_ctl.rc_out_at_rto = 0;
22373 }
22374 }
22375 rack->r_ctl.fsb.recwin = recwin;
22376 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
22377 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22378 /*
22379 * We hit an RTO and now have past snd_max at the RTO
22380 * clear all the WAS flags.
22381 */
22382 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
22383 }
22384 if (pacing_delay) {
22385 /* set the rack tcb into the slot N */
22386 if ((error == 0) &&
22387 rack_use_rfo &&
22388 ((flags & (TH_SYN|TH_FIN)) == 0) &&
22389 (rsm == NULL) &&
22390 (ipoptlen == 0) &&
22391 (doing_tlp == 0) &&
22392 rack->r_fsb_inited &&
22393 TCPS_HAVEESTABLISHED(tp->t_state) &&
22394 ((IN_RECOVERY(tp->t_flags)) == 0) &&
22395 (rack->r_must_retran == 0) &&
22396 ((tp->t_flags & TF_NEEDFIN) == 0) &&
22397 (len > 0) && (orig_len > 0) &&
22398 (orig_len > len) &&
22399 ((orig_len - len) >= segsiz) &&
22400 ((optlen == 0) ||
22401 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22402 /* We can send at least one more MSS using our fsb */
22403 rack_setup_fast_output(tp, rack, sb, len, orig_len,
22404 segsiz, pace_max_seg, hw_tls, flags);
22405 } else
22406 rack->r_fast_output = 0;
22407 rack_log_fsb(rack, tp, so, flags,
22408 ipoptlen, orig_len, len, error,
22409 (rsm == NULL), optlen, __LINE__, 2);
22410 } else if (sendalot) {
22411 int ret;
22412
22413 sack_rxmit = 0;
22414 if ((error == 0) &&
22415 rack_use_rfo &&
22416 ((flags & (TH_SYN|TH_FIN)) == 0) &&
22417 (rsm == NULL) &&
22418 (doing_tlp == 0) &&
22419 (ipoptlen == 0) &&
22420 (rack->r_must_retran == 0) &&
22421 rack->r_fsb_inited &&
22422 TCPS_HAVEESTABLISHED(tp->t_state) &&
22423 ((IN_RECOVERY(tp->t_flags)) == 0) &&
22424 ((tp->t_flags & TF_NEEDFIN) == 0) &&
22425 (len > 0) && (orig_len > 0) &&
22426 (orig_len > len) &&
22427 ((orig_len - len) >= segsiz) &&
22428 ((optlen == 0) ||
22429 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22430 /* we can use fast_output for more */
22431 rack_setup_fast_output(tp, rack, sb, len, orig_len,
22432 segsiz, pace_max_seg, hw_tls, flags);
22433 if (rack->r_fast_output) {
22434 error = 0;
22435 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
22436 if (ret >= 0)
22437 return (ret);
22438 else if (error)
22439 goto nomore;
22440
22441 }
22442 }
22443 goto again;
22444 }
22445 skip_all_send:
22446 /* Assure when we leave that snd_nxt will point to top */
22447 if (SEQ_GT(tp->snd_max, tp->snd_nxt))
22448 tp->snd_nxt = tp->snd_max;
22449 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0);
22450 #ifdef TCP_ACCOUNTING
22451 crtsc = get_cyclecount() - ts_val;
22452 if (tot_len_this_send) {
22453 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22454 tp->tcp_cnt_counters[SND_OUT_DATA]++;
22455 tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
22456 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
22457 }
22458 } else {
22459 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22460 tp->tcp_cnt_counters[SND_OUT_ACK]++;
22461 tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
22462 }
22463 }
22464 sched_unpin();
22465 #endif
22466 if (error == ENOBUFS)
22467 error = 0;
22468 return (error);
22469 }
22470
22471 static void
rack_update_seg(struct tcp_rack * rack)22472 rack_update_seg(struct tcp_rack *rack)
22473 {
22474 uint32_t orig_val;
22475
22476 orig_val = rack->r_ctl.rc_pace_max_segs;
22477 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
22478 if (orig_val != rack->r_ctl.rc_pace_max_segs)
22479 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
22480 }
22481
22482 static void
rack_mtu_change(struct tcpcb * tp)22483 rack_mtu_change(struct tcpcb *tp)
22484 {
22485 /*
22486 * The MSS may have changed
22487 */
22488 struct tcp_rack *rack;
22489 struct rack_sendmap *rsm;
22490
22491 rack = (struct tcp_rack *)tp->t_fb_ptr;
22492 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
22493 /*
22494 * The MTU has changed we need to resend everything
22495 * since all we have sent is lost. We first fix
22496 * up the mtu though.
22497 */
22498 rack_set_pace_segments(tp, rack, __LINE__, NULL);
22499 /* We treat this like a full retransmit timeout without the cwnd adjustment */
22500 rack_remxt_tmr(tp);
22501 rack->r_fast_output = 0;
22502 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
22503 rack->r_ctl.rc_sacked);
22504 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
22505 rack->r_must_retran = 1;
22506 /* Mark all inflight to needing to be rxt'd */
22507 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
22508 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG);
22509 }
22510 }
22511 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
22512 /* We don't use snd_nxt to retransmit */
22513 tp->snd_nxt = tp->snd_max;
22514 }
22515
static int
rack_set_dgp(struct tcp_rack *rack)
{
	/*
	 * Enable Dynamic Goodput Pacing (DGP) on this connection.
	 *
	 * Returns 0 on success (or if DGP is already on), EBUSY when a
	 * fixed-rate pacer is already active, or ENOSPC when no DGP
	 * pacing slot could be claimed.
	 */
	if (rack->dgp_on == 1)
		return(0);
	if ((rack->use_fixed_rate == 1) &&
	    (rack->rc_always_pace == 1)) {
		/*
		 * We are already pacing another
		 * way.
		 */
		return (EBUSY);
	}
	if (rack->rc_always_pace == 1) {
		/* Release the current (non-fixed) pacing claim first. */
		rack_remove_pacing(rack);
	}
	if (tcp_incr_dgp_pacing_cnt() == 0)
		return (ENOSPC);	/* No DGP pacing slots available. */
	rack->r_ctl.pacing_method |= RACK_DGP_PACING;
	rack->rc_fillcw_apply_discount = 0;
	rack->dgp_on = 1;
	rack->rc_always_pace = 1;
	rack->rc_pace_dnd = 1;
	rack->use_fixed_rate = 0;
	/* Only switch CC pacing once goodput measurement is ready. */
	if (rack->gp_ready)
		rack_set_cc_pacing(rack);
	rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
	rack->rack_attempt_hdwr_pace = 0;
	/* rxt settings */
	rack->full_size_rxt = 1;
	rack->shape_rxt_to_pacing_min = 0;
	/* cmpack=1 */
	rack->r_use_cmp_ack = 1;
	if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
	    rack->r_use_cmp_ack)
		rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
	/* scwnd=1 */
	rack->rack_enable_scwnd = 1;
	/* dynamic=100 */
	rack->rc_gp_dyn_mul = 1;
	/* gp_inc_ca */
	rack->r_ctl.rack_per_of_gp_ca = 100;
	/* rrr_conf=3 */
	rack->r_rr_config = 3;
	/* npush=2 */
	rack->r_ctl.rc_no_push_at_mrtt = 2;
	/* fillcw=1 */
	rack->rc_pace_to_cwnd = 1;
	rack->rc_pace_fill_if_rttin_range = 0;
	rack->rtt_limit_mul = 0;
	/* noprr=1 */
	rack->rack_no_prr = 1;
	/* lscwnd=1 */
	rack->r_limit_scw = 1;
	/* gp_inc_rec */
	rack->r_ctl.rack_per_of_gp_rec = 90;
	return (0);
}
22574
22575 static int
rack_set_profile(struct tcp_rack * rack,int prof)22576 rack_set_profile(struct tcp_rack *rack, int prof)
22577 {
22578 int err = EINVAL;
22579 if (prof == 1) {
22580 /*
22581 * Profile 1 is "standard" DGP. It ignores
22582 * client buffer level.
22583 */
22584 err = rack_set_dgp(rack);
22585 if (err)
22586 return (err);
22587 } else if (prof == 6) {
22588 err = rack_set_dgp(rack);
22589 if (err)
22590 return (err);
22591 /*
22592 * Profile 6 tweaks DGP so that it will apply to
22593 * fill-cw the same settings that profile5 does
22594 * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted).
22595 */
22596 rack->rc_fillcw_apply_discount = 1;
22597 } else if (prof == 0) {
22598 /* This changes things back to the default settings */
22599 if (rack->rc_always_pace == 1) {
22600 rack_remove_pacing(rack);
22601 } else {
22602 /* Make sure any stray flags are off */
22603 rack->dgp_on = 0;
22604 rack->rc_hybrid_mode = 0;
22605 rack->use_fixed_rate = 0;
22606 }
22607 err = 0;
22608 if (rack_fill_cw_state)
22609 rack->rc_pace_to_cwnd = 1;
22610 else
22611 rack->rc_pace_to_cwnd = 0;
22612
22613 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
22614 rack->r_ctl.pacing_method |= RACK_REG_PACING;
22615 rack->rc_always_pace = 1;
22616 if (rack->rack_hibeta)
22617 rack_set_cc_pacing(rack);
22618 } else
22619 rack->rc_always_pace = 0;
22620 if (rack_dsack_std_based & 0x1) {
22621 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
22622 rack->rc_rack_tmr_std_based = 1;
22623 }
22624 if (rack_dsack_std_based & 0x2) {
22625 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
22626 rack->rc_rack_use_dsack = 1;
22627 }
22628 if (rack_use_cmp_acks)
22629 rack->r_use_cmp_ack = 1;
22630 else
22631 rack->r_use_cmp_ack = 0;
22632 if (rack_disable_prr)
22633 rack->rack_no_prr = 1;
22634 else
22635 rack->rack_no_prr = 0;
22636 if (rack_gp_no_rec_chg)
22637 rack->rc_gp_no_rec_chg = 1;
22638 else
22639 rack->rc_gp_no_rec_chg = 0;
22640 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
22641 rack->r_mbuf_queue = 1;
22642 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
22643 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
22644 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
22645 } else {
22646 rack->r_mbuf_queue = 0;
22647 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
22648 }
22649 if (rack_enable_shared_cwnd)
22650 rack->rack_enable_scwnd = 1;
22651 else
22652 rack->rack_enable_scwnd = 0;
22653 if (rack_do_dyn_mul) {
22654 /* When dynamic adjustment is on CA needs to start at 100% */
22655 rack->rc_gp_dyn_mul = 1;
22656 if (rack_do_dyn_mul >= 100)
22657 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
22658 } else {
22659 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
22660 rack->rc_gp_dyn_mul = 0;
22661 }
22662 rack->r_rr_config = 0;
22663 rack->r_ctl.rc_no_push_at_mrtt = 0;
22664 rack->rc_pace_fill_if_rttin_range = 0;
22665 rack->rtt_limit_mul = 0;
22666
22667 if (rack_enable_hw_pacing)
22668 rack->rack_hdw_pace_ena = 1;
22669 else
22670 rack->rack_hdw_pace_ena = 0;
22671 if (rack_disable_prr)
22672 rack->rack_no_prr = 1;
22673 else
22674 rack->rack_no_prr = 0;
22675 if (rack_limits_scwnd)
22676 rack->r_limit_scw = 1;
22677 else
22678 rack->r_limit_scw = 0;
22679 rack_init_retransmit_value(rack, rack_rxt_controls);
22680 err = 0;
22681 }
22682 return (err);
22683 }
22684
22685 static int
rack_add_deferred_option(struct tcp_rack * rack,int sopt_name,uint64_t loptval)22686 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
22687 {
22688 struct deferred_opt_list *dol;
22689
22690 dol = malloc(sizeof(struct deferred_opt_list),
22691 M_TCPDO, M_NOWAIT|M_ZERO);
22692 if (dol == NULL) {
22693 /*
22694 * No space yikes -- fail out..
22695 */
22696 return (0);
22697 }
22698 dol->optname = sopt_name;
22699 dol->optval = loptval;
22700 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
22701 return (1);
22702 }
22703
static int
process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
{
#ifdef TCP_REQUEST_TRK
	/*
	 * Apply a hybrid-pacing request to this connection: clear any
	 * fixed rate, find/allocate the sendfile-track entry for the
	 * request, and either disable hybrid pacing or switch the
	 * connection into hybrid (regular-pacing) mode with the
	 * requested per-track settings.
	 *
	 * Returns 0 on success, ENOSPC when no track entry or pacing
	 * slot is available, or the error from rack_set_profile().
	 * Compiled out (ENOTSUP) without TCP_REQUEST_TRK.
	 */
	struct tcp_sendfile_track *sft;
	struct timeval tv;
	tcp_seq seq;
	int err;

	microuptime(&tv);

	/* Make sure no fixed rate is on */
	rack->use_fixed_rate = 0;
	rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
	rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
	rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
	/* Now allocate or find our entry that will have these settings */
	sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0);
	if (sft == NULL) {
		rack->rc_tp->tcp_hybrid_error++;
		/* no space, where would it have gone? */
		seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc;
		rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
		return (ENOSPC);
	}
	/* mask our internal flags */
	hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK;
	/* The seq will be snd_una + everything in the buffer */
	seq = sft->start_seq;
	if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
		/* Disabling hybrid pacing */
		if (rack->rc_hybrid_mode) {
			/* Back to default profile settings. */
			rack_set_profile(rack, 0);
			rack->rc_tp->tcp_hybrid_stop++;
		}
		rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0);
		return (0);
	}
	if (rack->dgp_on == 0) {
		/*
		 * If we have not yet turned DGP on, do so
		 * now setting pure DGP mode, no buffer level
		 * response.
		 */
		if ((err = rack_set_profile(rack, 1)) != 0){
			/* Failed to turn pacing on */
			rack->rc_tp->tcp_hybrid_error++;
			rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0);
			return (err);
		}
	}
	/*
	 * Now we must switch to hybrid mode as well which also
	 * means moving to regular pacing.
	 */
	if (rack->rc_hybrid_mode == 0) {
		/* First time */
		if (tcp_can_enable_pacing()) {
			rack->r_ctl.pacing_method |= RACK_REG_PACING;
			rack->rc_hybrid_mode = 1;
		} else {
			/* No regular pacing slot available. */
			return (ENOSPC);
		}
		if (rack->r_ctl.pacing_method & RACK_DGP_PACING) {
			/*
			 * This should be true.
			 */
			/* Hand the DGP slot back; we now hold a regular one. */
			tcp_dec_dgp_pacing_cnt();
			rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
		}
	}
	/* Now set in our flags */
	sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET;
	if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
		sft->cspr = hybrid->cspr;
	else
		sft->cspr = 0;
	if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS)
		sft->hint_maxseg = hybrid->hint_maxseg;
	else
		sft->hint_maxseg = 0;
	rack->rc_tp->tcp_hybrid_start++;
	rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
	return (0);
#else
	return (ENOTSUP);
#endif
}
22792
22793 static int
rack_stack_information(struct tcpcb * tp,struct stack_specific_info * si)22794 rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si)
22795 {
22796 /* We pulled a SSI info log out what was there */
22797 si->bytes_transmitted = tp->t_sndbytes;
22798 si->bytes_retransmitted = tp->t_snd_rxt_bytes;
22799 return (0);
22800 }
22801
22802 static int
rack_process_option(struct tcpcb * tp,struct tcp_rack * rack,int sopt_name,uint32_t optval,uint64_t loptval,struct tcp_hybrid_req * hybrid)22803 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
22804 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
22805
22806 {
22807 struct epoch_tracker et;
22808 struct sockopt sopt;
22809 struct cc_newreno_opts opt;
22810 uint64_t val;
22811 int error = 0;
22812 uint16_t ca, ss;
22813
22814 switch (sopt_name) {
22815 case TCP_RACK_SET_RXT_OPTIONS:
22816 if (optval <= 2) {
22817 rack_init_retransmit_value(rack, optval);
22818 } else {
22819 /*
22820 * You must send in 0, 1 or 2 all else is
22821 * invalid.
22822 */
22823 error = EINVAL;
22824 }
22825 break;
22826 case TCP_RACK_DSACK_OPT:
22827 RACK_OPTS_INC(tcp_rack_dsack_opt);
22828 if (optval & 0x1) {
22829 rack->rc_rack_tmr_std_based = 1;
22830 } else {
22831 rack->rc_rack_tmr_std_based = 0;
22832 }
22833 if (optval & 0x2) {
22834 rack->rc_rack_use_dsack = 1;
22835 } else {
22836 rack->rc_rack_use_dsack = 0;
22837 }
22838 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
22839 break;
22840 case TCP_RACK_PACING_DIVISOR:
22841 RACK_OPTS_INC(tcp_rack_pacing_divisor);
22842 if (optval == 0) {
22843 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
22844 } else {
22845 if (optval < RL_MIN_DIVISOR)
22846 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR;
22847 else
22848 rack->r_ctl.pace_len_divisor = optval;
22849 }
22850 break;
22851 case TCP_RACK_HI_BETA:
22852 RACK_OPTS_INC(tcp_rack_hi_beta);
22853 if (optval > 0) {
22854 rack->rack_hibeta = 1;
22855 if ((optval >= 50) &&
22856 (optval <= 100)) {
22857 /*
22858 * User wants to set a custom beta.
22859 */
22860 rack->r_ctl.saved_hibeta = optval;
22861 if (rack->rc_pacing_cc_set)
22862 rack_undo_cc_pacing(rack);
22863 rack->r_ctl.rc_saved_beta = optval;
22864 }
22865 if (rack->rc_pacing_cc_set == 0)
22866 rack_set_cc_pacing(rack);
22867 } else {
22868 rack->rack_hibeta = 0;
22869 if (rack->rc_pacing_cc_set)
22870 rack_undo_cc_pacing(rack);
22871 }
22872 break;
22873 case TCP_RACK_PACING_BETA:
22874 error = EINVAL;
22875 break;
22876 case TCP_RACK_TIMER_SLOP:
22877 RACK_OPTS_INC(tcp_rack_timer_slop);
22878 rack->r_ctl.timer_slop = optval;
22879 if (rack->rc_tp->t_srtt) {
22880 /*
22881 * If we have an SRTT lets update t_rxtcur
22882 * to have the new slop.
22883 */
22884 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
22885 rack_rto_min, rack_rto_max,
22886 rack->r_ctl.timer_slop);
22887 }
22888 break;
22889 case TCP_RACK_PACING_BETA_ECN:
22890 RACK_OPTS_INC(tcp_rack_beta_ecn);
22891 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
22892 /* This only works for newreno. */
22893 error = EINVAL;
22894 break;
22895 }
22896 if (rack->rc_pacing_cc_set) {
22897 /*
22898 * Set them into the real CC module
22899 * whats in the rack pcb is the old values
22900 * to be used on restoral/
22901 */
22902 sopt.sopt_dir = SOPT_SET;
22903 opt.name = CC_NEWRENO_BETA_ECN;
22904 opt.val = optval;
22905 if (CC_ALGO(tp)->ctl_output != NULL)
22906 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
22907 else
22908 error = ENOENT;
22909 } else {
22910 /*
22911 * Not pacing yet so set it into our local
22912 * rack pcb storage.
22913 */
22914 rack->r_ctl.rc_saved_beta_ecn = optval;
22915 }
22916 break;
22917 case TCP_DEFER_OPTIONS:
22918 RACK_OPTS_INC(tcp_defer_opt);
22919 if (optval) {
22920 if (rack->gp_ready) {
22921 /* Too late */
22922 error = EINVAL;
22923 break;
22924 }
22925 rack->defer_options = 1;
22926 } else
22927 rack->defer_options = 0;
22928 break;
22929 case TCP_RACK_MEASURE_CNT:
22930 RACK_OPTS_INC(tcp_rack_measure_cnt);
22931 if (optval && (optval <= 0xff)) {
22932 rack->r_ctl.req_measurements = optval;
22933 } else
22934 error = EINVAL;
22935 break;
22936 case TCP_REC_ABC_VAL:
22937 RACK_OPTS_INC(tcp_rec_abc_val);
22938 if (optval > 0)
22939 rack->r_use_labc_for_rec = 1;
22940 else
22941 rack->r_use_labc_for_rec = 0;
22942 break;
22943 case TCP_RACK_ABC_VAL:
22944 RACK_OPTS_INC(tcp_rack_abc_val);
22945 if ((optval > 0) && (optval < 255))
22946 rack->rc_labc = optval;
22947 else
22948 error = EINVAL;
22949 break;
22950 case TCP_HDWR_UP_ONLY:
22951 RACK_OPTS_INC(tcp_pacing_up_only);
22952 if (optval)
22953 rack->r_up_only = 1;
22954 else
22955 rack->r_up_only = 0;
22956 break;
22957 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
22958 RACK_OPTS_INC(tcp_fillcw_rate_cap);
22959 rack->r_ctl.fillcw_cap = loptval;
22960 break;
22961 case TCP_PACING_RATE_CAP:
22962 RACK_OPTS_INC(tcp_pacing_rate_cap);
22963 if ((rack->dgp_on == 1) &&
22964 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
22965 /*
22966 * If we are doing DGP we need to switch
22967 * to using the pacing limit.
22968 */
22969 if (tcp_can_enable_pacing() == 0) {
22970 error = ENOSPC;
22971 break;
22972 }
22973 /*
22974 * Now change up the flags and counts to be correct.
22975 */
22976 rack->r_ctl.pacing_method |= RACK_REG_PACING;
22977 tcp_dec_dgp_pacing_cnt();
22978 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
22979 }
22980 rack->r_ctl.bw_rate_cap = loptval;
22981 break;
22982 case TCP_HYBRID_PACING:
22983 if (hybrid == NULL) {
22984 error = EINVAL;
22985 break;
22986 }
22987 if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) {
22988 error = EPERM;
22989 break;
22990 }
22991 error = process_hybrid_pacing(rack, hybrid);
22992 break;
22993 case TCP_SIDECHAN_DIS: /* URL:scodm */
22994 if (optval)
22995 rack->r_ctl.side_chan_dis_mask = optval;
22996 else
22997 rack->r_ctl.side_chan_dis_mask = 0;
22998 break;
22999 case TCP_RACK_PROFILE:
23000 RACK_OPTS_INC(tcp_profile);
23001 error = rack_set_profile(rack, optval);
23002 break;
23003 case TCP_USE_CMP_ACKS:
23004 RACK_OPTS_INC(tcp_use_cmp_acks);
23005 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) {
23006 /* You can't turn it off once its on! */
23007 error = EINVAL;
23008 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
23009 rack->r_use_cmp_ack = 1;
23010 rack->r_mbuf_queue = 1;
23011 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23012 }
23013 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
23014 tp->t_flags2 |= TF2_MBUF_ACKCMP;
23015 break;
23016 case TCP_SHARED_CWND_TIME_LIMIT:
23017 RACK_OPTS_INC(tcp_lscwnd);
23018 if (optval)
23019 rack->r_limit_scw = 1;
23020 else
23021 rack->r_limit_scw = 0;
23022 break;
23023 case TCP_RACK_DGP_IN_REC:
23024 error = EINVAL;
23025 break;
23026 case TCP_RACK_PACE_TO_FILL:
23027 RACK_OPTS_INC(tcp_fillcw);
23028 if (optval == 0)
23029 rack->rc_pace_to_cwnd = 0;
23030 else {
23031 rack->rc_pace_to_cwnd = 1;
23032 }
23033 if ((optval >= rack_gp_rtt_maxmul) &&
23034 rack_gp_rtt_maxmul &&
23035 (optval < 0xf)) {
23036 rack->rc_pace_fill_if_rttin_range = 1;
23037 rack->rtt_limit_mul = optval;
23038 } else {
23039 rack->rc_pace_fill_if_rttin_range = 0;
23040 rack->rtt_limit_mul = 0;
23041 }
23042 break;
23043 case TCP_RACK_NO_PUSH_AT_MAX:
23044 RACK_OPTS_INC(tcp_npush);
23045 if (optval == 0)
23046 rack->r_ctl.rc_no_push_at_mrtt = 0;
23047 else if (optval < 0xff)
23048 rack->r_ctl.rc_no_push_at_mrtt = optval;
23049 else
23050 error = EINVAL;
23051 break;
23052 case TCP_SHARED_CWND_ENABLE:
23053 RACK_OPTS_INC(tcp_rack_scwnd);
23054 if (optval == 0)
23055 rack->rack_enable_scwnd = 0;
23056 else
23057 rack->rack_enable_scwnd = 1;
23058 break;
23059 case TCP_RACK_MBUF_QUEUE:
23060 /* Now do we use the LRO mbuf-queue feature */
23061 RACK_OPTS_INC(tcp_rack_mbufq);
23062 if (optval || rack->r_use_cmp_ack)
23063 rack->r_mbuf_queue = 1;
23064 else
23065 rack->r_mbuf_queue = 0;
23066 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
23067 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23068 else
23069 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
23070 break;
23071 case TCP_RACK_NONRXT_CFG_RATE:
23072 RACK_OPTS_INC(tcp_rack_cfg_rate);
23073 if (optval == 0)
23074 rack->rack_rec_nonrxt_use_cr = 0;
23075 else
23076 rack->rack_rec_nonrxt_use_cr = 1;
23077 break;
23078 case TCP_NO_PRR:
23079 RACK_OPTS_INC(tcp_rack_noprr);
23080 if (optval == 0)
23081 rack->rack_no_prr = 0;
23082 else if (optval == 1)
23083 rack->rack_no_prr = 1;
23084 else if (optval == 2)
23085 rack->no_prr_addback = 1;
23086 else
23087 error = EINVAL;
23088 break;
23089 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
23090 if (optval > 0)
23091 rack->cspr_is_fcc = 1;
23092 else
23093 rack->cspr_is_fcc = 0;
23094 break;
23095 case TCP_TIMELY_DYN_ADJ:
23096 RACK_OPTS_INC(tcp_timely_dyn);
23097 if (optval == 0)
23098 rack->rc_gp_dyn_mul = 0;
23099 else {
23100 rack->rc_gp_dyn_mul = 1;
23101 if (optval >= 100) {
23102 /*
23103 * If the user sets something 100 or more
23104 * its the gp_ca value.
23105 */
23106 rack->r_ctl.rack_per_of_gp_ca = optval;
23107 }
23108 }
23109 break;
23110 case TCP_RACK_DO_DETECTION:
23111 error = EINVAL;
23112 break;
23113 case TCP_RACK_TLP_USE:
23114 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
23115 error = EINVAL;
23116 break;
23117 }
23118 RACK_OPTS_INC(tcp_tlp_use);
23119 rack->rack_tlp_threshold_use = optval;
23120 break;
23121 case TCP_RACK_TLP_REDUCE:
23122 /* RACK TLP cwnd reduction (bool) */
23123 RACK_OPTS_INC(tcp_rack_tlp_reduce);
23124 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
23125 break;
23126 /* Pacing related ones */
23127 case TCP_RACK_PACE_ALWAYS:
23128 /*
23129 * zero is old rack method, 1 is new
23130 * method using a pacing rate.
23131 */
23132 RACK_OPTS_INC(tcp_rack_pace_always);
23133 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23134 error = EPERM;
23135 break;
23136 }
23137 if (optval > 0) {
23138 if (rack->rc_always_pace) {
23139 error = EALREADY;
23140 break;
23141 } else if (tcp_can_enable_pacing()) {
23142 rack->r_ctl.pacing_method |= RACK_REG_PACING;
23143 rack->rc_always_pace = 1;
23144 if (rack->rack_hibeta)
23145 rack_set_cc_pacing(rack);
23146 }
23147 else {
23148 error = ENOSPC;
23149 break;
23150 }
23151 } else {
23152 if (rack->rc_always_pace == 1) {
23153 rack_remove_pacing(rack);
23154 }
23155 }
23156 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
23157 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23158 else
23159 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
23160 /* A rate may be set irate or other, if so set seg size */
23161 rack_update_seg(rack);
23162 break;
23163 case TCP_BBR_RACK_INIT_RATE:
23164 RACK_OPTS_INC(tcp_initial_rate);
23165 val = optval;
23166 /* Change from kbits per second to bytes per second */
23167 val *= 1000;
23168 val /= 8;
23169 rack->r_ctl.init_rate = val;
23170 if (rack->rc_always_pace)
23171 rack_update_seg(rack);
23172 break;
23173 case TCP_BBR_IWINTSO:
23174 error = EINVAL;
23175 break;
23176 case TCP_RACK_FORCE_MSEG:
23177 RACK_OPTS_INC(tcp_rack_force_max_seg);
23178 if (optval)
23179 rack->rc_force_max_seg = 1;
23180 else
23181 rack->rc_force_max_seg = 0;
23182 break;
23183 case TCP_RACK_PACE_MIN_SEG:
23184 RACK_OPTS_INC(tcp_rack_min_seg);
23185 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval);
23186 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23187 break;
23188 case TCP_RACK_PACE_MAX_SEG:
23189 /* Max segments size in a pace in bytes */
23190 RACK_OPTS_INC(tcp_rack_max_seg);
23191 if ((rack->dgp_on == 1) &&
23192 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
23193 /*
23194 * If we set a max-seg and are doing DGP then
23195 * we now fall under the pacing limits not the
23196 * DGP ones.
23197 */
23198 if (tcp_can_enable_pacing() == 0) {
23199 error = ENOSPC;
23200 break;
23201 }
23202 /*
23203 * Now change up the flags and counts to be correct.
23204 */
23205 rack->r_ctl.pacing_method |= RACK_REG_PACING;
23206 tcp_dec_dgp_pacing_cnt();
23207 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
23208 }
23209 if (optval <= MAX_USER_SET_SEG)
23210 rack->rc_user_set_max_segs = optval;
23211 else
23212 rack->rc_user_set_max_segs = MAX_USER_SET_SEG;
23213 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23214 break;
23215 case TCP_RACK_PACE_RATE_REC:
23216 /* Set the fixed pacing rate in Bytes per second ca */
23217 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
23218 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23219 error = EPERM;
23220 break;
23221 }
23222 if (rack->dgp_on) {
23223 /*
23224 * We are already pacing another
23225 * way.
23226 */
23227 error = EBUSY;
23228 break;
23229 }
23230 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23231 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23232 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23233 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23234 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23235 rack->use_fixed_rate = 1;
23236 if (rack->rack_hibeta)
23237 rack_set_cc_pacing(rack);
23238 rack_log_pacing_delay_calc(rack,
23239 rack->r_ctl.rc_fixed_pacing_rate_ss,
23240 rack->r_ctl.rc_fixed_pacing_rate_ca,
23241 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23242 __LINE__, NULL,0);
23243 break;
23244
23245 case TCP_RACK_PACE_RATE_SS:
23246 /* Set the fixed pacing rate in Bytes per second ca */
23247 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
23248 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23249 error = EPERM;
23250 break;
23251 }
23252 if (rack->dgp_on) {
23253 /*
23254 * We are already pacing another
23255 * way.
23256 */
23257 error = EBUSY;
23258 break;
23259 }
23260 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23261 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23262 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23263 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23264 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23265 rack->use_fixed_rate = 1;
23266 if (rack->rack_hibeta)
23267 rack_set_cc_pacing(rack);
23268 rack_log_pacing_delay_calc(rack,
23269 rack->r_ctl.rc_fixed_pacing_rate_ss,
23270 rack->r_ctl.rc_fixed_pacing_rate_ca,
23271 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23272 __LINE__, NULL, 0);
23273 break;
23274
23275 case TCP_RACK_PACE_RATE_CA:
23276 /* Set the fixed pacing rate in Bytes per second ca */
23277 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
23278 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23279 error = EPERM;
23280 break;
23281 }
23282 if (rack->dgp_on) {
23283 /*
23284 * We are already pacing another
23285 * way.
23286 */
23287 error = EBUSY;
23288 break;
23289 }
23290 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23291 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23292 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23293 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23294 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23295 rack->use_fixed_rate = 1;
23296 if (rack->rack_hibeta)
23297 rack_set_cc_pacing(rack);
23298 rack_log_pacing_delay_calc(rack,
23299 rack->r_ctl.rc_fixed_pacing_rate_ss,
23300 rack->r_ctl.rc_fixed_pacing_rate_ca,
23301 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23302 __LINE__, NULL, 0);
23303 break;
23304 case TCP_RACK_GP_INCREASE_REC:
23305 RACK_OPTS_INC(tcp_gp_inc_rec);
23306 rack->r_ctl.rack_per_of_gp_rec = optval;
23307 rack_log_pacing_delay_calc(rack,
23308 rack->r_ctl.rack_per_of_gp_ss,
23309 rack->r_ctl.rack_per_of_gp_ca,
23310 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23311 __LINE__, NULL, 0);
23312 break;
23313 case TCP_RACK_GP_INCREASE_CA:
23314 RACK_OPTS_INC(tcp_gp_inc_ca);
23315 ca = optval;
23316 if (ca < 100) {
23317 /*
23318 * We don't allow any reduction
23319 * over the GP b/w.
23320 */
23321 error = EINVAL;
23322 break;
23323 }
23324 rack->r_ctl.rack_per_of_gp_ca = ca;
23325 rack_log_pacing_delay_calc(rack,
23326 rack->r_ctl.rack_per_of_gp_ss,
23327 rack->r_ctl.rack_per_of_gp_ca,
23328 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23329 __LINE__, NULL, 0);
23330 break;
23331 case TCP_RACK_GP_INCREASE_SS:
23332 RACK_OPTS_INC(tcp_gp_inc_ss);
23333 ss = optval;
23334 if (ss < 100) {
23335 /*
23336 * We don't allow any reduction
23337 * over the GP b/w.
23338 */
23339 error = EINVAL;
23340 break;
23341 }
23342 rack->r_ctl.rack_per_of_gp_ss = ss;
23343 rack_log_pacing_delay_calc(rack,
23344 rack->r_ctl.rack_per_of_gp_ss,
23345 rack->r_ctl.rack_per_of_gp_ca,
23346 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23347 __LINE__, NULL, 0);
23348 break;
23349 case TCP_RACK_RR_CONF:
23350 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
23351 if (optval && optval <= 3)
23352 rack->r_rr_config = optval;
23353 else
23354 rack->r_rr_config = 0;
23355 break;
23356 case TCP_PACING_DND: /* URL:dnd */
23357 if (optval > 0)
23358 rack->rc_pace_dnd = 1;
23359 else
23360 rack->rc_pace_dnd = 0;
23361 break;
23362 case TCP_HDWR_RATE_CAP:
23363 RACK_OPTS_INC(tcp_hdwr_rate_cap);
23364 if (optval) {
23365 if (rack->r_rack_hw_rate_caps == 0)
23366 rack->r_rack_hw_rate_caps = 1;
23367 else
23368 error = EALREADY;
23369 } else {
23370 rack->r_rack_hw_rate_caps = 0;
23371 }
23372 break;
23373 case TCP_DGP_UPPER_BOUNDS:
23374 {
23375 uint8_t val;
23376 val = optval & 0x0000ff;
23377 rack->r_ctl.rack_per_upper_bound_ca = val;
23378 val = (optval >> 16) & 0x0000ff;
23379 rack->r_ctl.rack_per_upper_bound_ss = val;
23380 break;
23381 }
23382 case TCP_SS_EEXIT: /* URL:eexit */
23383 if (optval > 0) {
23384 rack->r_ctl.gp_rnd_thresh = optval & 0x0ff;
23385 if (optval & 0x10000) {
23386 rack->r_ctl.gate_to_fs = 1;
23387 } else {
23388 rack->r_ctl.gate_to_fs = 0;
23389 }
23390 if (optval & 0x20000) {
23391 rack->r_ctl.use_gp_not_last = 1;
23392 } else {
23393 rack->r_ctl.use_gp_not_last = 0;
23394 }
23395 if (optval & 0xfffc0000) {
23396 uint32_t v;
23397
23398 v = (optval >> 18) & 0x00003fff;
23399 if (v >= 1000)
23400 rack->r_ctl.gp_gain_req = v;
23401 }
23402 } else {
23403 /* We do not do ss early exit at all */
23404 rack->rc_initial_ss_comp = 1;
23405 rack->r_ctl.gp_rnd_thresh = 0;
23406 }
23407 break;
23408 case TCP_RACK_SPLIT_LIMIT:
23409 RACK_OPTS_INC(tcp_split_limit);
23410 rack->r_ctl.rc_split_limit = optval;
23411 break;
23412 case TCP_BBR_HDWR_PACE:
23413 RACK_OPTS_INC(tcp_hdwr_pacing);
23414 if (optval){
23415 if (rack->rack_hdrw_pacing == 0) {
23416 rack->rack_hdw_pace_ena = 1;
23417 rack->rack_attempt_hdwr_pace = 0;
23418 } else
23419 error = EALREADY;
23420 } else {
23421 rack->rack_hdw_pace_ena = 0;
23422 #ifdef RATELIMIT
23423 if (rack->r_ctl.crte != NULL) {
23424 rack->rack_hdrw_pacing = 0;
23425 rack->rack_attempt_hdwr_pace = 0;
23426 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
23427 rack->r_ctl.crte = NULL;
23428 }
23429 #endif
23430 }
23431 break;
23432 /* End Pacing related ones */
23433 case TCP_RACK_PRR_SENDALOT:
23434 /* Allow PRR to send more than one seg */
23435 RACK_OPTS_INC(tcp_rack_prr_sendalot);
23436 rack->r_ctl.rc_prr_sendalot = optval;
23437 break;
23438 case TCP_RACK_MIN_TO:
23439 /* Minimum time between rack t-o's in ms */
23440 RACK_OPTS_INC(tcp_rack_min_to);
23441 rack->r_ctl.rc_min_to = optval;
23442 break;
23443 case TCP_RACK_EARLY_SEG:
23444 /* If early recovery max segments */
23445 RACK_OPTS_INC(tcp_rack_early_seg);
23446 rack->r_ctl.rc_early_recovery_segs = optval;
23447 break;
23448 case TCP_RACK_ENABLE_HYSTART:
23449 {
23450 if (optval) {
23451 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
23452 if (rack_do_hystart > RACK_HYSTART_ON)
23453 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
23454 if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
23455 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
23456 } else {
23457 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
23458 }
23459 }
23460 break;
23461 case TCP_RACK_REORD_THRESH:
23462 /* RACK reorder threshold (shift amount) */
23463 RACK_OPTS_INC(tcp_rack_reord_thresh);
23464 if ((optval > 0) && (optval < 31))
23465 rack->r_ctl.rc_reorder_shift = optval;
23466 else
23467 error = EINVAL;
23468 break;
23469 case TCP_RACK_REORD_FADE:
23470 /* Does reordering fade after ms time */
23471 RACK_OPTS_INC(tcp_rack_reord_fade);
23472 rack->r_ctl.rc_reorder_fade = optval;
23473 break;
23474 case TCP_RACK_TLP_THRESH:
23475 /* RACK TLP theshold i.e. srtt+(srtt/N) */
23476 RACK_OPTS_INC(tcp_rack_tlp_thresh);
23477 if (optval)
23478 rack->r_ctl.rc_tlp_threshold = optval;
23479 else
23480 error = EINVAL;
23481 break;
23482 case TCP_BBR_USE_RACK_RR:
23483 RACK_OPTS_INC(tcp_rack_rr);
23484 if (optval)
23485 rack->use_rack_rr = 1;
23486 else
23487 rack->use_rack_rr = 0;
23488 break;
23489 case TCP_RACK_PKT_DELAY:
23490 /* RACK added ms i.e. rack-rtt + reord + N */
23491 RACK_OPTS_INC(tcp_rack_pkt_delay);
23492 rack->r_ctl.rc_pkt_delay = optval;
23493 break;
23494 case TCP_DELACK:
23495 RACK_OPTS_INC(tcp_rack_delayed_ack);
23496 if (optval == 0)
23497 tp->t_delayed_ack = 0;
23498 else
23499 tp->t_delayed_ack = 1;
23500 if (tp->t_flags & TF_DELACK) {
23501 tp->t_flags &= ~TF_DELACK;
23502 tp->t_flags |= TF_ACKNOW;
23503 NET_EPOCH_ENTER(et);
23504 rack_output(tp);
23505 NET_EPOCH_EXIT(et);
23506 }
23507 break;
23508
23509 case TCP_BBR_RACK_RTT_USE:
23510 RACK_OPTS_INC(tcp_rack_rtt_use);
23511 if ((optval != USE_RTT_HIGH) &&
23512 (optval != USE_RTT_LOW) &&
23513 (optval != USE_RTT_AVG))
23514 error = EINVAL;
23515 else
23516 rack->r_ctl.rc_rate_sample_method = optval;
23517 break;
23518 case TCP_HONOR_HPTS_MIN:
23519 RACK_OPTS_INC(tcp_honor_hpts);
23520 if (optval) {
23521 rack->r_use_hpts_min = 1;
23522 /*
23523 * Must be between 2 - 80% to be a reduction else
23524 * we keep the default (10%).
23525 */
23526 if ((optval > 1) && (optval <= 80)) {
23527 rack->r_ctl.max_reduction = optval;
23528 }
23529 } else
23530 rack->r_use_hpts_min = 0;
23531 break;
23532 case TCP_REC_IS_DYN: /* URL:dynrec */
23533 RACK_OPTS_INC(tcp_dyn_rec);
23534 if (optval)
23535 rack->rc_gp_no_rec_chg = 1;
23536 else
23537 rack->rc_gp_no_rec_chg = 0;
23538 break;
23539 case TCP_NO_TIMELY:
23540 RACK_OPTS_INC(tcp_notimely);
23541 if (optval) {
23542 rack->rc_skip_timely = 1;
23543 rack->r_ctl.rack_per_of_gp_rec = 90;
23544 rack->r_ctl.rack_per_of_gp_ca = 100;
23545 rack->r_ctl.rack_per_of_gp_ss = 250;
23546 } else {
23547 rack->rc_skip_timely = 0;
23548 }
23549 break;
23550 case TCP_GP_USE_LTBW:
23551 if (optval == 0) {
23552 rack->use_lesser_lt_bw = 0;
23553 rack->dis_lt_bw = 1;
23554 } else if (optval == 1) {
23555 rack->use_lesser_lt_bw = 1;
23556 rack->dis_lt_bw = 0;
23557 } else if (optval == 2) {
23558 rack->use_lesser_lt_bw = 0;
23559 rack->dis_lt_bw = 0;
23560 }
23561 break;
23562 case TCP_DATA_AFTER_CLOSE:
23563 RACK_OPTS_INC(tcp_data_after_close);
23564 if (optval)
23565 rack->rc_allow_data_af_clo = 1;
23566 else
23567 rack->rc_allow_data_af_clo = 0;
23568 break;
23569 default:
23570 break;
23571 }
23572 tcp_log_socket_option(tp, sopt_name, optval, error);
23573 return (error);
23574 }
23575
23576 static void
rack_inherit(struct tcpcb * tp,struct inpcb * parent)23577 rack_inherit(struct tcpcb *tp, struct inpcb *parent)
23578 {
23579 /*
23580 * A new connection has been created (tp) and
23581 * the parent is the inpcb given. We want to
23582 * apply a read-lock to the parent (we are already
23583 * holding a write lock on the tp) and copy anything
23584 * out of the rack specific data as long as its tfb is
23585 * the same as ours i.e. we are the same stack. Otherwise
23586 * we just return.
23587 */
23588 struct tcpcb *par;
23589 struct tcp_rack *dest, *src;
23590 int cnt = 0;
23591
23592 par = intotcpcb(parent);
23593 if (par->t_fb != tp->t_fb) {
23594 /* Not the same stack */
23595 tcp_log_socket_option(tp, 0, 0, 1);
23596 return;
23597 }
23598 /* Ok if we reach here lets setup the two rack pointers */
23599 dest = (struct tcp_rack *)tp->t_fb_ptr;
23600 src = (struct tcp_rack *)par->t_fb_ptr;
23601 if ((src == NULL) || (dest == NULL)) {
23602 /* Huh? */
23603 tcp_log_socket_option(tp, 0, 0, 2);
23604 return;
23605 }
23606 /* Now copy out anything we wish to inherit i.e. things in socket-options */
23607 /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */
23608 if ((src->dgp_on) && (dest->dgp_on == 0)) {
23609 /* Profile 1 had to be set via sock opt */
23610 rack_set_dgp(dest);
23611 cnt++;
23612 }
23613 /* TCP_RACK_SET_RXT_OPTIONS */
23614 if (dest->full_size_rxt != src->full_size_rxt) {
23615 dest->full_size_rxt = src->full_size_rxt;
23616 cnt++;
23617 }
23618 if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) {
23619 dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min;
23620 cnt++;
23621 }
23622 /* TCP_RACK_DSACK_OPT */
23623 if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) {
23624 dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based;
23625 cnt++;
23626 }
23627 if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) {
23628 dest->rc_rack_use_dsack = src->rc_rack_use_dsack;
23629 cnt++;
23630 }
23631 /* TCP_RACK_PACING_DIVISOR */
23632 if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) {
23633 dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor;
23634 cnt++;
23635 }
23636 /* TCP_RACK_HI_BETA */
23637 if (src->rack_hibeta != dest->rack_hibeta) {
23638 cnt++;
23639 if (src->rack_hibeta) {
23640 dest->r_ctl.rc_saved_beta = src->r_ctl.rc_saved_beta;
23641 dest->rack_hibeta = 1;
23642 } else {
23643 dest->rack_hibeta = 0;
23644 }
23645 }
23646 /* TCP_RACK_TIMER_SLOP */
23647 if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) {
23648 dest->r_ctl.timer_slop = src->r_ctl.timer_slop;
23649 cnt++;
23650 }
23651 /* TCP_RACK_PACING_BETA_ECN */
23652 if (dest->r_ctl.rc_saved_beta_ecn != src->r_ctl.rc_saved_beta_ecn) {
23653 dest->r_ctl.rc_saved_beta_ecn = src->r_ctl.rc_saved_beta_ecn;
23654 cnt++;
23655 }
23656 /* We do not do TCP_DEFER_OPTIONS */
23657 /* TCP_RACK_MEASURE_CNT */
23658 if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) {
23659 dest->r_ctl.req_measurements = src->r_ctl.req_measurements;
23660 cnt++;
23661 }
23662 /* TCP_HDWR_UP_ONLY */
23663 if (dest->r_up_only != src->r_up_only) {
23664 dest->r_up_only = src->r_up_only;
23665 cnt++;
23666 }
23667 /* TCP_FILLCW_RATE_CAP */
23668 if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) {
23669 dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap;
23670 cnt++;
23671 }
23672 /* TCP_PACING_RATE_CAP */
23673 if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) {
23674 dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap;
23675 cnt++;
23676 }
23677 /* A listener can't set TCP_HYBRID_PACING */
23678 /* TCP_SIDECHAN_DIS */
23679 if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) {
23680 dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask;
23681 cnt++;
23682 }
23683 /* TCP_SHARED_CWND_TIME_LIMIT */
23684 if (dest->r_limit_scw != src->r_limit_scw) {
23685 dest->r_limit_scw = src->r_limit_scw;
23686 cnt++;
23687 }
23688 /* TCP_RACK_PACE_TO_FILL */
23689 if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) {
23690 dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd;
23691 cnt++;
23692 }
23693 if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) {
23694 dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range;
23695 cnt++;
23696 }
23697 if (dest->rtt_limit_mul != src->rtt_limit_mul) {
23698 dest->rtt_limit_mul = src->rtt_limit_mul;
23699 cnt++;
23700 }
23701 /* TCP_RACK_NO_PUSH_AT_MAX */
23702 if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) {
23703 dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt;
23704 cnt++;
23705 }
23706 /* TCP_SHARED_CWND_ENABLE */
23707 if (dest->rack_enable_scwnd != src->rack_enable_scwnd) {
23708 dest->rack_enable_scwnd = src->rack_enable_scwnd;
23709 cnt++;
23710 }
23711 /* TCP_USE_CMP_ACKS */
23712 if (dest->r_use_cmp_ack != src->r_use_cmp_ack) {
23713 dest->r_use_cmp_ack = src->r_use_cmp_ack;
23714 cnt++;
23715 }
23716
23717 if (dest->r_mbuf_queue != src->r_mbuf_queue) {
23718 dest->r_mbuf_queue = src->r_mbuf_queue;
23719 cnt++;
23720 }
23721 /* TCP_RACK_MBUF_QUEUE */
23722 if (dest->r_mbuf_queue != src->r_mbuf_queue) {
23723 dest->r_mbuf_queue = src->r_mbuf_queue;
23724 cnt++;
23725 }
23726 if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) {
23727 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23728 } else {
23729 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
23730 }
23731 if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) {
23732 tp->t_flags2 |= TF2_MBUF_ACKCMP;
23733 }
23734 /* TCP_RACK_NONRXT_CFG_RATE */
23735 if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) {
23736 dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr;
23737 cnt++;
23738 }
23739 /* TCP_NO_PRR */
23740 if (dest->rack_no_prr != src->rack_no_prr) {
23741 dest->rack_no_prr = src->rack_no_prr;
23742 cnt++;
23743 }
23744 if (dest->no_prr_addback != src->no_prr_addback) {
23745 dest->no_prr_addback = src->no_prr_addback;
23746 cnt++;
23747 }
23748 /* RACK_CSPR_IS_FCC */
23749 if (dest->cspr_is_fcc != src->cspr_is_fcc) {
23750 dest->cspr_is_fcc = src->cspr_is_fcc;
23751 cnt++;
23752 }
23753 /* TCP_TIMELY_DYN_ADJ */
23754 if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) {
23755 dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul;
23756 cnt++;
23757 }
23758 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
23759 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
23760 cnt++;
23761 }
23762 /* TCP_RACK_TLP_USE */
23763 if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) {
23764 dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use;
23765 cnt++;
23766 }
23767 /* we don't allow inheritence of TCP_RACK_PACE_ALWAYS */
23768 /* TCP_BBR_RACK_INIT_RATE */
23769 if (dest->r_ctl.init_rate != src->r_ctl.init_rate) {
23770 dest->r_ctl.init_rate = src->r_ctl.init_rate;
23771 cnt++;
23772 }
23773 /* TCP_RACK_FORCE_MSEG */
23774 if (dest->rc_force_max_seg != src->rc_force_max_seg) {
23775 dest->rc_force_max_seg = src->rc_force_max_seg;
23776 cnt++;
23777 }
23778 /* TCP_RACK_PACE_MIN_SEG */
23779 if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) {
23780 dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs;
23781 cnt++;
23782 }
23783 /* we don't allow TCP_RACK_PACE_MAX_SEG */
23784 /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */
23785 if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) {
23786 dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca;
23787 cnt++;
23788 }
23789 if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) {
23790 dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss;
23791 cnt++;
23792 }
23793 if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) {
23794 dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec;
23795 cnt++;
23796 }
23797 /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */
23798 if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) {
23799 dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec;
23800 cnt++;
23801 }
23802 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
23803 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
23804 cnt++;
23805 }
23806
23807 if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) {
23808 dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss;
23809 cnt++;
23810 }
23811 /* TCP_RACK_RR_CONF */
23812 if (dest->r_rr_config != src->r_rr_config) {
23813 dest->r_rr_config = src->r_rr_config;
23814 cnt++;
23815 }
23816 /* TCP_PACING_DND */
23817 if (dest->rc_pace_dnd != src->rc_pace_dnd) {
23818 dest->rc_pace_dnd = src->rc_pace_dnd;
23819 cnt++;
23820 }
23821 /* TCP_HDWR_RATE_CAP */
23822 if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) {
23823 dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps;
23824 cnt++;
23825 }
23826 /* TCP_DGP_UPPER_BOUNDS */
23827 if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) {
23828 dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca;
23829 cnt++;
23830 }
23831 if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) {
23832 dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss;
23833 cnt++;
23834 }
23835 /* TCP_SS_EEXIT */
23836 if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) {
23837 dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh;
23838 cnt++;
23839 }
23840 if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) {
23841 dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs;
23842 cnt++;
23843 }
23844 if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) {
23845 dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last;
23846 cnt++;
23847 }
23848 if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) {
23849 dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req;
23850 cnt++;
23851 }
23852 /* TCP_BBR_HDWR_PACE */
23853 if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) {
23854 dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena;
23855 cnt++;
23856 }
23857 if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) {
23858 dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace;
23859 cnt++;
23860 }
23861 /* TCP_RACK_PRR_SENDALOT */
23862 if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) {
23863 dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot;
23864 cnt++;
23865 }
23866 /* TCP_RACK_MIN_TO */
23867 if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) {
23868 dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to;
23869 cnt++;
23870 }
23871 /* TCP_RACK_EARLY_SEG */
23872 if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) {
23873 dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs;
23874 cnt++;
23875 }
23876 /* TCP_RACK_ENABLE_HYSTART */
23877 if (par->t_ccv.flags != tp->t_ccv.flags) {
23878 cnt++;
23879 if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) {
23880 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
23881 if (rack_do_hystart > RACK_HYSTART_ON)
23882 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
23883 if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
23884 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
23885 } else {
23886 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
23887 }
23888 }
23889 /* TCP_RACK_REORD_THRESH */
23890 if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) {
23891 dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift;
23892 cnt++;
23893 }
23894 /* TCP_RACK_REORD_FADE */
23895 if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) {
23896 dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade;
23897 cnt++;
23898 }
23899 /* TCP_RACK_TLP_THRESH */
23900 if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) {
23901 dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold;
23902 cnt++;
23903 }
23904 /* TCP_BBR_USE_RACK_RR */
23905 if (dest->use_rack_rr != src->use_rack_rr) {
23906 dest->use_rack_rr = src->use_rack_rr;
23907 cnt++;
23908 }
23909 /* TCP_RACK_PKT_DELAY */
23910 if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) {
23911 dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay;
23912 cnt++;
23913 }
23914 /* TCP_DELACK will get copied via the main code if applicable */
23915 /* TCP_BBR_RACK_RTT_USE */
23916 if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) {
23917 dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method;
23918 cnt++;
23919 }
23920 /* TCP_HONOR_HPTS_MIN */
23921 if (dest->r_use_hpts_min != src->r_use_hpts_min) {
23922 dest->r_use_hpts_min = src->r_use_hpts_min;
23923 cnt++;
23924 }
23925 if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) {
23926 dest->r_ctl.max_reduction = src->r_ctl.max_reduction;
23927 cnt++;
23928 }
23929 /* TCP_REC_IS_DYN */
23930 if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) {
23931 dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg;
23932 cnt++;
23933 }
23934 if (dest->rc_skip_timely != src->rc_skip_timely) {
23935 dest->rc_skip_timely = src->rc_skip_timely;
23936 cnt++;
23937 }
23938 /* TCP_DATA_AFTER_CLOSE */
23939 if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) {
23940 dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo;
23941 cnt++;
23942 }
23943 /* TCP_GP_USE_LTBW */
23944 if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) {
23945 dest->use_lesser_lt_bw = src->use_lesser_lt_bw;
23946 cnt++;
23947 }
23948 if (dest->dis_lt_bw != src->dis_lt_bw) {
23949 dest->dis_lt_bw = src->dis_lt_bw;
23950 cnt++;
23951 }
23952 tcp_log_socket_option(tp, 0, cnt, 0);
23953 }
23954
23955
23956 static void
rack_apply_deferred_options(struct tcp_rack * rack)23957 rack_apply_deferred_options(struct tcp_rack *rack)
23958 {
23959 struct deferred_opt_list *dol, *sdol;
23960 uint32_t s_optval;
23961
23962 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
23963 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
23964 /* Disadvantage of deferal is you loose the error return */
23965 s_optval = (uint32_t)dol->optval;
23966 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL);
23967 free(dol, M_TCPDO);
23968 }
23969 }
23970
23971 static void
rack_hw_tls_change(struct tcpcb * tp,int chg)23972 rack_hw_tls_change(struct tcpcb *tp, int chg)
23973 {
23974 /* Update HW tls state */
23975 struct tcp_rack *rack;
23976
23977 rack = (struct tcp_rack *)tp->t_fb_ptr;
23978 if (chg)
23979 rack->r_ctl.fsb.hw_tls = 1;
23980 else
23981 rack->r_ctl.fsb.hw_tls = 0;
23982 }
23983
23984 static int
rack_pru_options(struct tcpcb * tp,int flags)23985 rack_pru_options(struct tcpcb *tp, int flags)
23986 {
23987 if (flags & PRUS_OOB)
23988 return (EOPNOTSUPP);
23989 return (0);
23990 }
23991
23992 static bool
rack_wake_check(struct tcpcb * tp)23993 rack_wake_check(struct tcpcb *tp)
23994 {
23995 struct tcp_rack *rack;
23996 struct timeval tv;
23997 uint32_t cts;
23998
23999 rack = (struct tcp_rack *)tp->t_fb_ptr;
24000 if (rack->r_ctl.rc_hpts_flags) {
24001 cts = tcp_get_usecs(&tv);
24002 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){
24003 /*
24004 * Pacing timer is up, check if we are ready.
24005 */
24006 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to))
24007 return (true);
24008 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) {
24009 /*
24010 * A timer is up, check if we are ready.
24011 */
24012 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp))
24013 return (true);
24014 }
24015 }
24016 return (false);
24017 }
24018
/*
 * Function-block export table: wires the rack stack's entry points
 * into the TCP stack-switch framework. Registered under the name
 * given by STACKNAME.
 */
static struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_do_queued_segments = ctf_do_queued_segments,
	.tfb_do_segment_nounlock = rack_do_segment_nounlock,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok,
	.tfb_tcp_mtu_chg = rack_mtu_change,
	.tfb_pru_options = rack_pru_options,
	.tfb_hwtls_change = rack_hw_tls_change,
	.tfb_chg_query = rack_chg_query,
	.tfb_switch_failed = rack_switch_failed,
	.tfb_early_wake_check = rack_wake_check,
	.tfb_compute_pipe = rack_compute_pipe,
	.tfb_stack_info = rack_stack_information,
	.tfb_inherit = rack_inherit,
	/* rack may drop packets from output and is allowed as system default */
	.tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK,

};
24043
24044 /*
24045 * rack_ctloutput() must drop the inpcb lock before performing copyin on
24046 * socket option arguments. When it re-acquires the lock after the copy, it
24047 * has to revalidate that the connection is still valid for the socket
24048 * option.
24049 */
static int
rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt)
{
	/*
	 * Set-side socket-option handler for the rack stack. Called with
	 * the inpcb write lock held; every return path must release it.
	 * IP/IPv6/socket-level options update cached header state; the
	 * IPPROTO_TCP options recognized below are copied in (with the
	 * lock dropped around the copyin) and handed to
	 * rack_process_option(), possibly via the deferred-option list.
	 */
	struct inpcb *inp = tptoinpcb(tp);
#ifdef INET
	struct ip *ip;
#endif
	struct tcp_rack *rack;
	struct tcp_hybrid_req hybrid;
	uint64_t loptval;
	int32_t error = 0, optval;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
#ifdef INET
	/* Cached IP header in the fast-send block; kept in sync below. */
	ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
#endif

	switch (sopt->sopt_level) {
#ifdef INET6
	case IPPROTO_IPV6:
		MPASS(inp->inp_vflag & INP_IPV6PROTO);
		switch (sopt->sopt_name) {
		case IPV6_USE_MIN_MTU:
			tcp6_use_min_mtu(tp);
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
#ifdef INET
	case IPPROTO_IP:
		switch (sopt->sopt_name) {
		case IP_TOS:
			/*
			 * The DSCP codepoint has changed, update the fsb.
			 */
			ip->ip_tos = rack->rc_inp->inp_ip_tos;
			break;
		case IP_TTL:
			/*
			 * The TTL has changed, update the fsb.
			 */
			ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
#ifdef SO_PEERPRIO
	case SOL_SOCKET:
		switch (sopt->sopt_name) {
		case SO_PEERPRIO:			/* SC-URL:bs */
			/* Already read in and sanity checked in sosetopt(). */
			if (inp->inp_socket) {
				rack->client_bufferlvl = inp->inp_socket->so_peerprio;
			}
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
	case IPPROTO_TCP:
		switch (sopt->sopt_name) {
		case TCP_RACK_TLP_REDUCE:		/*  URL:tlp_reduce */
		/*  Pacing related ones */
		case TCP_RACK_PACE_ALWAYS:		/*  URL:pace_always */
		case TCP_BBR_RACK_INIT_RATE:		/*  URL:irate */
		case TCP_RACK_PACE_MIN_SEG:		/*  URL:pace_min_seg */
		case TCP_RACK_PACE_MAX_SEG:		/*  URL:pace_max_seg */
		case TCP_RACK_FORCE_MSEG:		/*  URL:force_max_seg */
		case TCP_RACK_PACE_RATE_CA:		/*  URL:pr_ca */
		case TCP_RACK_PACE_RATE_SS:		/*  URL:pr_ss*/
		case TCP_RACK_PACE_RATE_REC:		/*  URL:pr_rec */
		case TCP_RACK_GP_INCREASE_CA:		/*  URL:gp_inc_ca */
		case TCP_RACK_GP_INCREASE_SS:		/*  URL:gp_inc_ss */
		case TCP_RACK_GP_INCREASE_REC:		/*  URL:gp_inc_rec */
		case TCP_RACK_RR_CONF:			/*  URL:rrr_conf */
		case TCP_BBR_HDWR_PACE:			/*  URL:hdwrpace */
		case TCP_HDWR_RATE_CAP:			/*  URL:hdwrcap boolean */
		case TCP_PACING_RATE_CAP:		/*  URL:cap  -- used by side-channel */
		case TCP_HDWR_UP_ONLY:			/*  URL:uponly -- hardware pacing  boolean */
		case TCP_FILLCW_RATE_CAP:		/*  URL:fillcw_cap */
		case TCP_RACK_PACING_BETA_ECN:		/*  URL:pacing_beta_ecn */
		case TCP_RACK_PACE_TO_FILL:		/*  URL:fillcw */
		/* End pacing related */
		case TCP_DELACK:			/*  URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
		case TCP_RACK_PRR_SENDALOT:		/*  URL:prr_sendalot */
		case TCP_RACK_MIN_TO:			/*  URL:min_to */
		case TCP_RACK_EARLY_SEG:		/*  URL:early_seg */
		case TCP_RACK_REORD_THRESH:		/*  URL:reord_thresh */
		case TCP_RACK_REORD_FADE:		/*  URL:reord_fade */
		case TCP_RACK_TLP_THRESH:		/*  URL:tlp_thresh */
		case TCP_RACK_PKT_DELAY:		/*  URL:pkt_delay */
		case TCP_RACK_TLP_USE:			/*  URL:tlp_use */
		case TCP_BBR_RACK_RTT_USE:		/*  URL:rttuse */
		case TCP_BBR_USE_RACK_RR:		/*  URL:rackrr */
		case TCP_NO_PRR:			/*  URL:noprr */
		case TCP_TIMELY_DYN_ADJ:		/*  URL:dynamic */
		case TCP_DATA_AFTER_CLOSE:		/*  no URL */
		case TCP_RACK_NONRXT_CFG_RATE:		/*  URL:nonrxtcr */
		case TCP_SHARED_CWND_ENABLE:		/*  URL:scwnd */
		case TCP_RACK_MBUF_QUEUE:		/*  URL:mqueue */
		case TCP_RACK_NO_PUSH_AT_MAX:		/*  URL:npush */
		case TCP_SHARED_CWND_TIME_LIMIT:	/*  URL:lscwnd */
		case TCP_RACK_PROFILE:			/*  URL:profile */
		case TCP_SIDECHAN_DIS:			/*  URL:scodm */
		case TCP_HYBRID_PACING:			/*  URL:pacing=hybrid */
		case TCP_USE_CMP_ACKS:			/*  URL:cmpack */
		case TCP_RACK_ABC_VAL:			/*  URL:labc */
		case TCP_REC_ABC_VAL:			/*  URL:reclabc */
		case TCP_RACK_MEASURE_CNT:		/*  URL:measurecnt */
		case TCP_DEFER_OPTIONS:			/*  URL:defer */
		case TCP_RACK_DSACK_OPT:		/*  URL:dsack */
		case TCP_RACK_TIMER_SLOP:		/*  URL:timer_slop */
		case TCP_RACK_ENABLE_HYSTART:		/*  URL:hystart */
		case TCP_RACK_SET_RXT_OPTIONS:		/*  URL:rxtsz */
		case TCP_RACK_HI_BETA:			/*  URL:hibeta */
		case TCP_RACK_SPLIT_LIMIT:		/*  URL:split */
		case TCP_SS_EEXIT:			/*  URL:eexit */
		case TCP_DGP_UPPER_BOUNDS:		/*  URL:upper */
		case TCP_RACK_PACING_DIVISOR:		/*  URL:divisor */
		case TCP_PACING_DND:			/*  URL:dnd */
		case TCP_NO_TIMELY:			/*  URL:notimely */
		case RACK_CSPR_IS_FCC:			/*  URL:csprisfcc */
		case TCP_HONOR_HPTS_MIN:		/*  URL:hptsmin */
		case TCP_REC_IS_DYN:			/*  URL:dynrec */
		case TCP_GP_USE_LTBW:			/*  URL:useltbw */
			goto process_opt;
			break;
		default:
			/* Filter off all unknown options to the base stack */
			return (tcp_default_ctloutput(tp, sopt));
			break;
		}
	default:
		INP_WUNLOCK(inp);
		return (0);
	}
process_opt:
	/* Drop the lock across the (possibly sleeping) copyin. */
	INP_WUNLOCK(inp);
	if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
	    (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) {
		error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
		/*
		 * We truncate it down to 32 bits for the socket-option trace this
		 * means rates > 34Gbps won't show right, but thats probably ok.
		 */
		optval = (uint32_t)loptval;
	} else if (sopt->sopt_name == TCP_HYBRID_PACING) {
		error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
	} else {
		error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
		/* Save it in 64 bit form too */
		loptval = optval;
	}
	if (error)
		return (error);
	INP_WLOCK(inp);
	/* Revalidate: the stack may have switched while unlocked. */
	if (tp->t_fb != &__tcp_rack) {
		INP_WUNLOCK(inp);
		return (ENOPROTOOPT);
	}
	/*
	 * Until the first goodput measurement completes, most options are
	 * queued (if deferral is enabled) rather than applied immediately;
	 * the exceptions below must always take effect right away.
	 */
	if (rack->defer_options && (rack->gp_ready == 0) &&
	    (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
	    (sopt->sopt_name != TCP_HYBRID_PACING) &&
	    (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
	    (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
	    (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
		/* Options are being deferred */
		if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
			INP_WUNLOCK(inp);
			return (0);
		} else {
			/* No memory to defer, fail */
			INP_WUNLOCK(inp);
			return (ENOMEM);
		}
	}
	error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
	INP_WUNLOCK(inp);
	return (error);
}
24236
24237 static void
rack_fill_info(struct tcpcb * tp,struct tcp_info * ti)24238 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
24239 {
24240
24241 INP_WLOCK_ASSERT(tptoinpcb(tp));
24242 bzero(ti, sizeof(*ti));
24243
24244 ti->tcpi_state = tp->t_state;
24245 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
24246 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
24247 if (tp->t_flags & TF_SACK_PERMIT)
24248 ti->tcpi_options |= TCPI_OPT_SACK;
24249 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
24250 ti->tcpi_options |= TCPI_OPT_WSCALE;
24251 ti->tcpi_snd_wscale = tp->snd_scale;
24252 ti->tcpi_rcv_wscale = tp->rcv_scale;
24253 }
24254 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
24255 ti->tcpi_options |= TCPI_OPT_ECN;
24256 if (tp->t_flags & TF_FASTOPEN)
24257 ti->tcpi_options |= TCPI_OPT_TFO;
24258 /* still kept in ticks is t_rcvtime */
24259 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
24260 /* Since we hold everything in precise useconds this is easy */
24261 ti->tcpi_rtt = tp->t_srtt;
24262 ti->tcpi_rttvar = tp->t_rttvar;
24263 ti->tcpi_rto = tp->t_rxtcur;
24264 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
24265 ti->tcpi_snd_cwnd = tp->snd_cwnd;
24266 /*
24267 * FreeBSD-specific extension fields for tcp_info.
24268 */
24269 ti->tcpi_rcv_space = tp->rcv_wnd;
24270 ti->tcpi_rcv_nxt = tp->rcv_nxt;
24271 ti->tcpi_snd_wnd = tp->snd_wnd;
24272 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
24273 ti->tcpi_snd_nxt = tp->snd_nxt;
24274 ti->tcpi_snd_mss = tp->t_maxseg;
24275 ti->tcpi_rcv_mss = tp->t_maxseg;
24276 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
24277 ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
24278 ti->tcpi_snd_zerowin = tp->t_sndzerowin;
24279 ti->tcpi_total_tlp = tp->t_sndtlppack;
24280 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
24281 ti->tcpi_rttmin = tp->t_rttlow;
24282 #ifdef NETFLIX_STATS
24283 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
24284 #endif
24285 #ifdef TCP_OFFLOAD
24286 if (tp->t_flags & TF_TOE) {
24287 ti->tcpi_options |= TCPI_OPT_TOE;
24288 tcp_offload_tcp_info(tp, ti);
24289 }
24290 #endif
24291 }
24292
24293 static int
rack_get_sockopt(struct tcpcb * tp,struct sockopt * sopt)24294 rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt)
24295 {
24296 struct inpcb *inp = tptoinpcb(tp);
24297 struct tcp_rack *rack;
24298 int32_t error, optval;
24299 uint64_t val, loptval;
24300 struct tcp_info ti;
24301 /*
24302 * Because all our options are either boolean or an int, we can just
24303 * pull everything into optval and then unlock and copy. If we ever
24304 * add a option that is not a int, then this will have quite an
24305 * impact to this routine.
24306 */
24307 error = 0;
24308 rack = (struct tcp_rack *)tp->t_fb_ptr;
24309 if (rack == NULL) {
24310 INP_WUNLOCK(inp);
24311 return (EINVAL);
24312 }
24313 switch (sopt->sopt_name) {
24314 case TCP_INFO:
24315 /* First get the info filled */
24316 rack_fill_info(tp, &ti);
24317 /* Fix up the rtt related fields if needed */
24318 INP_WUNLOCK(inp);
24319 error = sooptcopyout(sopt, &ti, sizeof ti);
24320 return (error);
24321 /*
24322 * Beta is the congestion control value for NewReno that influences how
24323 * much of a backoff happens when loss is detected. It is normally set
24324 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
24325 * when you exit recovery.
24326 */
24327 case TCP_RACK_PACING_BETA:
24328 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
24329 error = EINVAL;
24330 else if (rack->rc_pacing_cc_set == 0)
24331 optval = rack->r_ctl.rc_saved_beta;
24332 else {
24333 /*
24334 * Reach out into the CC data and report back what
24335 * I have previously set. Yeah it looks hackish but
24336 * we don't want to report the saved values.
24337 */
24338 if (tp->t_ccv.cc_data)
24339 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
24340 else
24341 error = EINVAL;
24342 }
24343 break;
24344 /*
24345 * Beta_ecn is the congestion control value for NewReno that influences how
24346 * much of a backoff happens when a ECN mark is detected. It is normally set
24347 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
24348 * you exit recovery. Note that classic ECN has a beta of 50, it is only
24349 * ABE Ecn that uses this "less" value, but we do too with pacing :)
24350 */
24351 case TCP_RACK_PACING_BETA_ECN:
24352 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
24353 error = EINVAL;
24354 else if (rack->rc_pacing_cc_set == 0)
24355 optval = rack->r_ctl.rc_saved_beta_ecn;
24356 else {
24357 /*
24358 * Reach out into the CC data and report back what
24359 * I have previously set. Yeah it looks hackish but
24360 * we don't want to report the saved values.
24361 */
24362 if (tp->t_ccv.cc_data)
24363 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn;
24364 else
24365 error = EINVAL;
24366 }
24367 break;
24368 case TCP_RACK_DSACK_OPT:
24369 optval = 0;
24370 if (rack->rc_rack_tmr_std_based) {
24371 optval |= 1;
24372 }
24373 if (rack->rc_rack_use_dsack) {
24374 optval |= 2;
24375 }
24376 break;
24377 case TCP_RACK_ENABLE_HYSTART:
24378 {
24379 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
24380 optval = RACK_HYSTART_ON;
24381 if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND)
24382 optval = RACK_HYSTART_ON_W_SC;
24383 if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH)
24384 optval = RACK_HYSTART_ON_W_SC_C;
24385 } else {
24386 optval = RACK_HYSTART_OFF;
24387 }
24388 }
24389 break;
24390 case TCP_RACK_DGP_IN_REC:
24391 error = EINVAL;
24392 break;
24393 case TCP_RACK_HI_BETA:
24394 optval = rack->rack_hibeta;
24395 break;
24396 case TCP_DEFER_OPTIONS:
24397 optval = rack->defer_options;
24398 break;
24399 case TCP_RACK_MEASURE_CNT:
24400 optval = rack->r_ctl.req_measurements;
24401 break;
24402 case TCP_REC_ABC_VAL:
24403 optval = rack->r_use_labc_for_rec;
24404 break;
24405 case TCP_RACK_ABC_VAL:
24406 optval = rack->rc_labc;
24407 break;
24408 case TCP_HDWR_UP_ONLY:
24409 optval= rack->r_up_only;
24410 break;
24411 case TCP_FILLCW_RATE_CAP:
24412 loptval = rack->r_ctl.fillcw_cap;
24413 break;
24414 case TCP_PACING_RATE_CAP:
24415 loptval = rack->r_ctl.bw_rate_cap;
24416 break;
24417 case TCP_RACK_PROFILE:
24418 /* You cannot retrieve a profile, its write only */
24419 error = EINVAL;
24420 break;
24421 case TCP_SIDECHAN_DIS:
24422 optval = rack->r_ctl.side_chan_dis_mask;
24423 break;
24424 case TCP_HYBRID_PACING:
24425 /* You cannot retrieve hybrid pacing information, its write only */
24426 error = EINVAL;
24427 break;
24428 case TCP_USE_CMP_ACKS:
24429 optval = rack->r_use_cmp_ack;
24430 break;
24431 case TCP_RACK_PACE_TO_FILL:
24432 optval = rack->rc_pace_to_cwnd;
24433 break;
24434 case TCP_RACK_NO_PUSH_AT_MAX:
24435 optval = rack->r_ctl.rc_no_push_at_mrtt;
24436 break;
24437 case TCP_SHARED_CWND_ENABLE:
24438 optval = rack->rack_enable_scwnd;
24439 break;
24440 case TCP_RACK_NONRXT_CFG_RATE:
24441 optval = rack->rack_rec_nonrxt_use_cr;
24442 break;
24443 case TCP_NO_PRR:
24444 if (rack->rack_no_prr == 1)
24445 optval = 1;
24446 else if (rack->no_prr_addback == 1)
24447 optval = 2;
24448 else
24449 optval = 0;
24450 break;
24451 case TCP_GP_USE_LTBW:
24452 if (rack->dis_lt_bw) {
24453 /* It is not used */
24454 optval = 0;
24455 } else if (rack->use_lesser_lt_bw) {
24456 /* we use min() */
24457 optval = 1;
24458 } else {
24459 /* we use max() */
24460 optval = 2;
24461 }
24462 break;
24463 case TCP_RACK_DO_DETECTION:
24464 error = EINVAL;
24465 break;
24466 case TCP_RACK_MBUF_QUEUE:
24467 /* Now do we use the LRO mbuf-queue feature */
24468 optval = rack->r_mbuf_queue;
24469 break;
24470 case RACK_CSPR_IS_FCC:
24471 optval = rack->cspr_is_fcc;
24472 break;
24473 case TCP_TIMELY_DYN_ADJ:
24474 optval = rack->rc_gp_dyn_mul;
24475 break;
24476 case TCP_BBR_IWINTSO:
24477 error = EINVAL;
24478 break;
24479 case TCP_RACK_TLP_REDUCE:
24480 /* RACK TLP cwnd reduction (bool) */
24481 optval = rack->r_ctl.rc_tlp_cwnd_reduce;
24482 break;
24483 case TCP_BBR_RACK_INIT_RATE:
24484 val = rack->r_ctl.init_rate;
24485 /* convert to kbits per sec */
24486 val *= 8;
24487 val /= 1000;
24488 optval = (uint32_t)val;
24489 break;
24490 case TCP_RACK_FORCE_MSEG:
24491 optval = rack->rc_force_max_seg;
24492 break;
24493 case TCP_RACK_PACE_MIN_SEG:
24494 optval = rack->r_ctl.rc_user_set_min_segs;
24495 break;
24496 case TCP_RACK_PACE_MAX_SEG:
24497 /* Max segments in a pace */
24498 optval = rack->rc_user_set_max_segs;
24499 break;
24500 case TCP_RACK_PACE_ALWAYS:
24501 /* Use the always pace method */
24502 optval = rack->rc_always_pace;
24503 break;
24504 case TCP_RACK_PRR_SENDALOT:
24505 /* Allow PRR to send more than one seg */
24506 optval = rack->r_ctl.rc_prr_sendalot;
24507 break;
24508 case TCP_RACK_MIN_TO:
24509 /* Minimum time between rack t-o's in ms */
24510 optval = rack->r_ctl.rc_min_to;
24511 break;
24512 case TCP_RACK_SPLIT_LIMIT:
24513 optval = rack->r_ctl.rc_split_limit;
24514 break;
24515 case TCP_RACK_EARLY_SEG:
24516 /* If early recovery max segments */
24517 optval = rack->r_ctl.rc_early_recovery_segs;
24518 break;
24519 case TCP_RACK_REORD_THRESH:
24520 /* RACK reorder threshold (shift amount) */
24521 optval = rack->r_ctl.rc_reorder_shift;
24522 break;
24523 case TCP_SS_EEXIT:
24524 if (rack->r_ctl.gp_rnd_thresh) {
24525 uint32_t v;
24526
24527 v = rack->r_ctl.gp_gain_req;
24528 v <<= 17;
24529 optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff);
24530 if (rack->r_ctl.gate_to_fs == 1)
24531 optval |= 0x10000;
24532 } else
24533 optval = 0;
24534 break;
24535 case TCP_RACK_REORD_FADE:
24536 /* Does reordering fade after ms time */
24537 optval = rack->r_ctl.rc_reorder_fade;
24538 break;
24539 case TCP_BBR_USE_RACK_RR:
24540 /* Do we use the rack cheat for rxt */
24541 optval = rack->use_rack_rr;
24542 break;
24543 case TCP_RACK_RR_CONF:
24544 optval = rack->r_rr_config;
24545 break;
24546 case TCP_HDWR_RATE_CAP:
24547 optval = rack->r_rack_hw_rate_caps;
24548 break;
24549 case TCP_BBR_HDWR_PACE:
24550 optval = rack->rack_hdw_pace_ena;
24551 break;
24552 case TCP_RACK_TLP_THRESH:
24553 /* RACK TLP theshold i.e. srtt+(srtt/N) */
24554 optval = rack->r_ctl.rc_tlp_threshold;
24555 break;
24556 case TCP_RACK_PKT_DELAY:
24557 /* RACK added ms i.e. rack-rtt + reord + N */
24558 optval = rack->r_ctl.rc_pkt_delay;
24559 break;
24560 case TCP_RACK_TLP_USE:
24561 optval = rack->rack_tlp_threshold_use;
24562 break;
24563 case TCP_PACING_DND:
24564 optval = rack->rc_pace_dnd;
24565 break;
24566 case TCP_RACK_PACE_RATE_CA:
24567 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
24568 break;
24569 case TCP_RACK_PACE_RATE_SS:
24570 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
24571 break;
24572 case TCP_RACK_PACE_RATE_REC:
24573 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
24574 break;
24575 case TCP_DGP_UPPER_BOUNDS:
24576 optval = rack->r_ctl.rack_per_upper_bound_ss;
24577 optval <<= 16;
24578 optval |= rack->r_ctl.rack_per_upper_bound_ca;
24579 break;
24580 case TCP_RACK_GP_INCREASE_SS:
24581 optval = rack->r_ctl.rack_per_of_gp_ca;
24582 break;
24583 case TCP_RACK_GP_INCREASE_CA:
24584 optval = rack->r_ctl.rack_per_of_gp_ss;
24585 break;
24586 case TCP_RACK_PACING_DIVISOR:
24587 optval = rack->r_ctl.pace_len_divisor;
24588 break;
24589 case TCP_BBR_RACK_RTT_USE:
24590 optval = rack->r_ctl.rc_rate_sample_method;
24591 break;
24592 case TCP_DELACK:
24593 optval = tp->t_delayed_ack;
24594 break;
24595 case TCP_DATA_AFTER_CLOSE:
24596 optval = rack->rc_allow_data_af_clo;
24597 break;
24598 case TCP_SHARED_CWND_TIME_LIMIT:
24599 optval = rack->r_limit_scw;
24600 break;
24601 case TCP_HONOR_HPTS_MIN:
24602 if (rack->r_use_hpts_min)
24603 optval = rack->r_ctl.max_reduction;
24604 else
24605 optval = 0;
24606 break;
24607 case TCP_REC_IS_DYN:
24608 optval = rack->rc_gp_no_rec_chg;
24609 break;
24610 case TCP_NO_TIMELY:
24611 optval = rack->rc_skip_timely;
24612 break;
24613 case TCP_RACK_TIMER_SLOP:
24614 optval = rack->r_ctl.timer_slop;
24615 break;
24616 default:
24617 return (tcp_default_ctloutput(tp, sopt));
24618 break;
24619 }
24620 INP_WUNLOCK(inp);
24621 if (error == 0) {
24622 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
24623 (sopt->sopt_name == TCP_FILLCW_RATE_CAP))
24624 error = sooptcopyout(sopt, &loptval, sizeof loptval);
24625 else
24626 error = sooptcopyout(sopt, &optval, sizeof optval);
24627 }
24628 return (error);
24629 }
24630
24631 static int
rack_ctloutput(struct tcpcb * tp,struct sockopt * sopt)24632 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
24633 {
24634 if (sopt->sopt_dir == SOPT_SET) {
24635 return (rack_set_sockopt(tp, sopt));
24636 } else if (sopt->sopt_dir == SOPT_GET) {
24637 return (rack_get_sockopt(tp, sopt));
24638 } else {
24639 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
24640 }
24641 }
24642
/*
 * Names under which this stack registers with the TCP function-block
 * framework: the primary STACKNAME plus an optional compile-time alias.
 */
static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};
24649
24650 static int
rack_ctor(void * mem,int32_t size,void * arg,int32_t how)24651 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
24652 {
24653 memset(mem, 0, size);
24654 return (0);
24655 }
24656
24657 static void
rack_dtor(void * mem,int32_t size,void * arg)24658 rack_dtor(void *mem, int32_t size, void *arg)
24659 {
24660
24661 }
24662
24663 static bool rack_mod_inited = false;
24664
/*
 * Module event handler for the rack TCP stack.  On load it creates
 * the UMA zones, builds the sysctl tree, and registers the stack's
 * function block under each name in rack_stack_names; on quiesce and
 * unload it deregisters and tears everything back down.
 * Returns 0 on success or an errno value.
 */
static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		/* Per-sendmap-entry zone; ctor zeroes each allocation. */
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		/* Per-connection control-block zone (no dtor needed). */
		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		/* Root the stack's sysctls under net.inet.tcp.<name>. */
		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			/*
			 * On failure num_stacks indexes the name that
			 * could not be registered.  The free_uma label
			 * above also lands here so the sysctl-node
			 * failure path shares this zone cleanup.
			 */
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		/* Only tear down resources a successful MOD_LOAD created. */
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}
24738
/* Module glue: hand tcp_addrack to the kernel module framework. */
static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
/* This stack requires the TCP high-precision timer system (hpts). */
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
24748
24749 #endif /* #if !defined(INET) && !defined(INET6) */
24750