1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _TCP_ECN_H
3 #define _TCP_ECN_H
4
5 #include <linux/tcp.h>
6 #include <linux/skbuff.h>
7 #include <linux/bitfield.h>
8
9 #include <net/inet_connection_sock.h>
10 #include <net/sock.h>
11 #include <net/tcp.h>
12 #include <net/inet_ecn.h>
13
/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
 * attempted to be negotiated for incoming connections ("IN") and
 * requested for outgoing connections ("OUT"), respectively.
 */
enum tcp_ecn_mode {
	TCP_ECN_IN_NOECN_OUT_NOECN	= 0,	/* in: no ECN, out: no ECN */
	TCP_ECN_IN_ECN_OUT_ECN		= 1,	/* in: ECN,    out: ECN    */
	TCP_ECN_IN_ECN_OUT_NOECN	= 2,	/* in: ECN,    out: no ECN */
	TCP_ECN_IN_ACCECN_OUT_ACCECN	= 3,	/* in: AccECN, out: AccECN */
	TCP_ECN_IN_ACCECN_OUT_ECN	= 4,	/* in: AccECN, out: ECN    */
	TCP_ECN_IN_ACCECN_OUT_NOECN	= 5,	/* in: AccECN, out: no ECN */
};
26
/* Policy for sending the AccECN option once AccECN has been
 * successfully negotiated.
 */
enum tcp_accecn_option {
	TCP_ACCECN_OPTION_DISABLED	= 0,
	TCP_ACCECN_OPTION_MINIMUM	= 1,
	TCP_ACCECN_OPTION_FULL		= 2,
	TCP_ACCECN_OPTION_PERSIST	= 3,
};
34
/* Enable ECN-capable transmission on @sk, using either ECT(0) or ECT(1)
 * as selected by tcp_ca_ect_1_negotiation() (i.e. based on the
 * TCP_CONG_ECT_1_NEGOTIATION flag).
 */
static inline void INET_ECN_xmit_ect_1_negotiation(struct sock *sk)
{
	__INET_ECN_xmit(sk, tcp_ca_ect_1_negotiation(sk));
}
40
tcp_ecn_queue_cwr(struct tcp_sock * tp)41 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
42 {
43 /* Do not set CWR if in AccECN mode! */
44 if (tcp_ecn_mode_rfc3168(tp))
45 tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
46 }
47
tcp_ecn_accept_cwr(struct sock * sk,const struct sk_buff * skb)48 static inline void tcp_ecn_accept_cwr(struct sock *sk,
49 const struct sk_buff *skb)
50 {
51 struct tcp_sock *tp = tcp_sk(sk);
52
53 if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
54 tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
55
56 /* If the sender is telling us it has entered CWR, then its
57 * cwnd may be very low (even just 1 packet), so we should ACK
58 * immediately.
59 */
60 if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
61 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
62 }
63 }
64
tcp_ecn_withdraw_cwr(struct tcp_sock * tp)65 static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
66 {
67 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
68 }
69
tcp_accecn_ace_fail_send(const struct tcp_sock * tp)70 static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
71 {
72 return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
73 }
74
tcp_accecn_ace_fail_recv(const struct tcp_sock * tp)75 static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
76 {
77 return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
78 }
79
tcp_accecn_opt_fail_send(const struct tcp_sock * tp)80 static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
81 {
82 return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
83 }
84
tcp_accecn_opt_fail_recv(const struct tcp_sock * tp)85 static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
86 {
87 return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
88 }
89
/* OR the failure bit(s) in @mode into tp->accecn_fail_mode; bits that
 * are already set are never cleared here.
 */
static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
{
	tp->accecn_fail_mode |= mode;
}
94
tcp_accecn_ace(const struct tcphdr * th)95 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
96 {
97 return (th->ae << 2) | (th->cwr << 1) | th->ece;
98 }
99
100 /* Infer the ECT value our SYN arrived with from the echoed ACE field */
tcp_accecn_extract_syn_ect(u8 ace)101 static inline int tcp_accecn_extract_syn_ect(u8 ace)
102 {
103 /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
104 static const int ace_to_ecn[8] = {
105 INET_ECN_ECT_0, /* 0b000 (Undefined) */
106 INET_ECN_ECT_1, /* 0b001 (Undefined) */
107 INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */
108 INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */
109 INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */
110 INET_ECN_ECT_1, /* 0b101 (Reserved) */
111 INET_ECN_CE, /* 0b110 (CE is received) */
112 INET_ECN_ECT_1 /* 0b111 (Undefined) */
113 };
114
115 return ace_to_ecn[ace & 0x7];
116 }
117
118 /* Check ECN field transition to detect invalid transitions */
tcp_ect_transition_valid(u8 snt,u8 rcv)119 static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
120 {
121 if (rcv == snt)
122 return true;
123
124 /* Non-ECT altered to something or something became non-ECT */
125 if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
126 return false;
127 /* CE -> ECT(0/1)? */
128 if (snt == INET_ECN_CE)
129 return false;
130 return true;
131 }
132
tcp_accecn_validate_syn_feedback(struct sock * sk,u8 ace,u8 sent_ect)133 static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
134 u8 sent_ect)
135 {
136 u8 ect = tcp_accecn_extract_syn_ect(ace);
137 struct tcp_sock *tp = tcp_sk(sk);
138
139 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
140 return true;
141
142 if (!tcp_ect_transition_valid(sent_ect, ect)) {
143 tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
144 return false;
145 }
146
147 return true;
148 }
149
/* Store @saw_opt in tp->saw_accecn_opt and, if the stored state is
 * TCP_ACCECN_OPT_FAIL_SEEN, record TCP_ACCECN_OPT_FAIL_RECV.
 */
static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
						u8 saw_opt)
{
	tp->saw_accecn_opt = saw_opt;

	/* Compare the stored field, not the argument */
	if (tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL_SEEN)
		return;

	tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
}
157
158 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
tcp_accecn_third_ack(struct sock * sk,const struct sk_buff * skb,u8 sent_ect)159 static inline void tcp_accecn_third_ack(struct sock *sk,
160 const struct sk_buff *skb, u8 sent_ect)
161 {
162 u8 ace = tcp_accecn_ace(tcp_hdr(skb));
163 struct tcp_sock *tp = tcp_sk(sk);
164
165 switch (ace) {
166 case 0x0:
167 /* Invalid value */
168 if (!TCP_SKB_CB(skb)->sacked)
169 tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV |
170 TCP_ACCECN_OPT_FAIL_RECV);
171 break;
172 case 0x7:
173 case 0x5:
174 case 0x1:
175 /* Unused but legal values */
176 break;
177 default:
178 /* Validation only applies to first non-data packet */
179 if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
180 !TCP_SKB_CB(skb)->sacked &&
181 tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
182 if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
183 !tp->delivered_ce)
184 tp->delivered_ce++;
185 }
186 break;
187 }
188 }
189
190 /* Demand the minimum # to send AccECN optnio */
tcp_accecn_opt_demand_min(struct sock * sk,u8 opt_demand_min)191 static inline void tcp_accecn_opt_demand_min(struct sock *sk,
192 u8 opt_demand_min)
193 {
194 struct tcp_sock *tp = tcp_sk(sk);
195 u8 opt_demand;
196
197 opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
198 tp->accecn_opt_demand = opt_demand;
199 }
200
201 /* Maps IP ECN field ECT/CE code point to AccECN option field number, given
202 * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
203 */
tcp_ecnfield_to_accecn_optfield(u8 ecnfield)204 static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
205 {
206 switch (ecnfield & INET_ECN_MASK) {
207 case INET_ECN_NOT_ECT:
208 return 0; /* AccECN does not send counts of NOT_ECT */
209 case INET_ECN_ECT_1:
210 return 1;
211 case INET_ECN_CE:
212 return 2;
213 case INET_ECN_ECT_0:
214 return 3;
215 }
216 return 0;
217 }
218
219 /* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
220 * Some fields do not start from zero, to detect zeroing by middleboxes.
221 */
tcp_accecn_field_init_offset(u8 ecnfield)222 static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
223 {
224 switch (ecnfield & INET_ECN_MASK) {
225 case INET_ECN_NOT_ECT:
226 return 0; /* AccECN does not send counts of NOT_ECT */
227 case INET_ECN_ECT_1:
228 return TCP_ACCECN_E1B_INIT_OFFSET;
229 case INET_ECN_CE:
230 return TCP_ACCECN_CEB_INIT_OFFSET;
231 case INET_ECN_ECT_0:
232 return TCP_ACCECN_E0B_INIT_OFFSET;
233 }
234 return 0;
235 }
236
237 /* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
tcp_accecn_optfield_to_ecnfield(unsigned int option,bool order)238 static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
239 bool order)
240 {
241 /* Based on Table 5 of the AccECN spec to map (option, order) to
242 * the corresponding ECN conuters (ECT-1, ECT-0, or CE).
243 */
244 static const u8 optfield_lookup[2][3] = {
245 /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
246 { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
247 /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
248 { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
249 };
250
251 return optfield_lookup[order][option % 3];
252 }
253
254 /* Handles AccECN option ECT and CE 24-bit byte counters update into
255 * the u32 value in tcp_sock. As we're processing TCP options, it is
256 * safe to access from - 1.
257 */
tcp_update_ecn_bytes(u32 * cnt,const char * from,u32 init_offset)258 static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
259 u32 init_offset)
260 {
261 u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
262 0xFFFFFFU;
263 u32 delta = (truncated - *cnt) & 0xFFFFFFU;
264
265 /* If delta has the highest bit set (24th bit) indicating
266 * negative, sign extend to correct an estimation using
267 * sign_extend32(delta, 24 - 1)
268 */
269 delta = sign_extend32(delta, 23);
270 *cnt += delta;
271 return (s32)delta;
272 }
273
274 /* Updates Accurate ECN received counters from the received IP ECN field */
tcp_ecn_received_counters(struct sock * sk,const struct sk_buff * skb,u32 len)275 static inline void tcp_ecn_received_counters(struct sock *sk,
276 const struct sk_buff *skb, u32 len)
277 {
278 u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
279 u8 is_ce = INET_ECN_is_ce(ecnfield);
280 struct tcp_sock *tp = tcp_sk(sk);
281 bool ecn_edge;
282
283 if (!INET_ECN_is_not_ect(ecnfield)) {
284 u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
285
286 /* As for accurate ECN, the TCP_ECN_SEEN flag is set by
287 * tcp_ecn_received_counters() when the ECN codepoint of
288 * received TCP data or ACK contains ECT(0), ECT(1), or CE.
289 */
290 if (!tcp_ecn_mode_rfc3168(tp))
291 tp->ecn_flags |= TCP_ECN_SEEN;
292
293 /* ACE counter tracks *all* segments including pure ACKs */
294 tp->received_ce += pcount;
295 tp->received_ce_pending = min(tp->received_ce_pending + pcount,
296 0xfU);
297
298 if (len > 0) {
299 u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
300 u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
301 u32 bytes_mask = GENMASK_U32(31, 22);
302
303 tp->received_ecn_bytes[ecnfield - 1] += len;
304 tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
305 minlen);
306
307 /* Send AccECN option at least once per 2^22-byte
308 * increase in any ECN byte counter.
309 */
310 if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
311 bytes_mask) {
312 tcp_accecn_opt_demand_min(sk, 1);
313 }
314 }
315 }
316
317 ecn_edge = tp->prev_ecnfield != ecnfield;
318 if (ecn_edge || is_ce) {
319 tp->prev_ecnfield = ecnfield;
320 /* Demand Accurate ECN change-triggered ACKs. Two ACK are
321 * demanded to indicate unambiguously the ecnfield value
322 * in the latter ACK.
323 */
324 if (tcp_ecn_mode_accecn(tp)) {
325 if (ecn_edge)
326 inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
327 tp->accecn_opt_demand = 2;
328 }
329 }
330 }
331
332 /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
333 * initialized at the start of the half-connection. [...] These byte counters
334 * reflect only the TCP payload length, excluding TCP header and TCP options.
335 */
tcp_ecn_received_counters_payload(struct sock * sk,const struct sk_buff * skb)336 static inline void tcp_ecn_received_counters_payload(struct sock *sk,
337 const struct sk_buff *skb)
338 {
339 const struct tcphdr *th = (const struct tcphdr *)skb->data;
340
341 tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
342 }
343
344 /* AccECN specification, 5.1: [...] a server can determine that it
345 * negotiated AccECN as [...] if the ACK contains an ACE field with
346 * the value 0b010 to 0b111 (decimal 2 to 7).
347 */
cookie_accecn_ok(const struct tcphdr * th)348 static inline bool cookie_accecn_ok(const struct tcphdr *th)
349 {
350 return tcp_accecn_ace(th) > 0x1;
351 }
352
353 /* Used to form the ACE flags for SYN/ACK */
tcp_accecn_reflector_flags(u8 ect)354 static inline u16 tcp_accecn_reflector_flags(u8 ect)
355 {
356 /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
357 * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
358 * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
359 */
360 static const u8 ecn_to_ace_flags[4] = {
361 0b010, /* Not-ECT is received */
362 0b011, /* ECT(1) is received */
363 0b100, /* ECT(0) is received */
364 0b110 /* CE is received */
365 };
366
367 return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
368 }
369
370 /* AccECN specification, 3.1.2: If a TCP server that implements AccECN
371 * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
372 * to any combination other than 000, 011 or 111, it MUST negotiate the
373 * use of AccECN as if they had been set to 111.
374 */
tcp_accecn_syn_requested(const struct tcphdr * th)375 static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
376 {
377 u8 ace = tcp_accecn_ace(th);
378
379 return ace && ace != 0x3;
380 }
381
__tcp_accecn_init_bytes_counters(int * counter_array)382 static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
383 {
384 BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
385 BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
386 BUILD_BUG_ON(INET_ECN_CE != 0x3);
387
388 counter_array[INET_ECN_ECT_1 - 1] = 0;
389 counter_array[INET_ECN_ECT_0 - 1] = 0;
390 counter_array[INET_ECN_CE - 1] = 0;
391 }
392
tcp_accecn_init_counters(struct tcp_sock * tp)393 static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
394 {
395 tp->received_ce = 0;
396 tp->received_ce_pending = 0;
397 __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
398 __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
399 tp->accecn_opt_sent_w_dsack = 0;
400 tp->accecn_minlen = 0;
401 tp->accecn_opt_demand = 0;
402 tp->est_ecnfield = 0;
403 }
404
405 /* Used for make_synack to form the ACE flags */
tcp_accecn_echo_syn_ect(struct tcphdr * th,u8 ect)406 static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
407 {
408 /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
409 * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
410 * +====================+====================================+
411 * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK |
412 * | received on SYN | AE CWR ECE |
413 * +====================+====================================+
414 * | Not-ECT | 0 1 0 |
415 * | ECT(1) | 0 1 1 |
416 * | ECT(0) | 1 0 0 |
417 * | CE | 1 1 0 |
418 * +====================+====================================+
419 */
420 th->ae = !!(ect & INET_ECN_ECT_0);
421 th->cwr = ect != INET_ECN_ECT_0;
422 th->ece = ect == INET_ECN_ECT_1;
423 }
424
tcp_accecn_set_ace(struct tcp_sock * tp,struct sk_buff * skb,struct tcphdr * th)425 static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
426 struct tcphdr *th)
427 {
428 u32 wire_ace;
429
430 /* The final packet of the 3WHS or anything like it must reflect
431 * the SYN/ACK ECT instead of putting CEP into ACE field, such
432 * case show up in tcp_flags.
433 */
434 if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
435 wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
436 th->ece = !!(wire_ace & 0x1);
437 th->cwr = !!(wire_ace & 0x2);
438 th->ae = !!(wire_ace & 0x4);
439 tp->received_ce_pending = 0;
440 }
441 }
442
/* Classify the AccECN option located @opt_offset bytes into the transport
 * header of @skb.
 *
 * Returns:
 *   TCP_ACCECN_OPT_FAIL_SEEN    - the option kind is not an AccECN kind, or
 *                                 an initial counter field that is checked
 *                                 here is zero (zeroing detected);
 *   TCP_ACCECN_OPT_EMPTY_SEEN   - the option carries no complete field;
 *   TCP_ACCECN_OPT_COUNTER_SEEN - at least one field is present and the
 *                                 checked fields are non-zero.
 */
static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
					u8 opt_offset)
{
	u8 *ptr = skb_transport_header(skb) + opt_offset;
	unsigned int optlen;

	if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
		return TCP_ACCECN_OPT_FAIL_SEEN;

	/* ptr[1] is the option length including the 2-byte kind/length
	 * header. Clamp instead of subtracting blindly: a length below 2
	 * would otherwise wrap to a huge unsigned value, defeat the length
	 * checks below and let the field reads run past the option. Such
	 * an option is handled as carrying no fields.
	 */
	optlen = ptr[1] < 2 ? 0 : ptr[1] - 2;
	ptr += 2;

	/* Detect option zeroing: an AccECN connection "MAY check that the
	 * initial value of the EE0B field or the EE1B field is non-zero"
	 */
	if (optlen < TCPOLEN_ACCECN_PERFIELD)
		return TCP_ACCECN_OPT_EMPTY_SEEN;
	if (get_unaligned_be24(ptr) == 0)
		return TCP_ACCECN_OPT_FAIL_SEEN;
	if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
		return TCP_ACCECN_OPT_COUNTER_SEEN;

	/* All three fields present: also check the third one */
	ptr += TCPOLEN_ACCECN_PERFIELD * 2;
	if (get_unaligned_be24(ptr) == 0)
		return TCP_ACCECN_OPT_FAIL_SEEN;

	return TCP_ACCECN_OPT_COUNTER_SEEN;
}
468
/* Switch @sk to AccECN mode upon a SYN/ACK, remember the ECN codepoint
 * the SYN/ACK arrived with (@dsf), and examine an AccECN option if the
 * SYN/ACK carried one.
 */
static inline void tcp_ecn_rcv_synack_accecn(struct sock *sk,
					     const struct sk_buff *skb, u8 dsf)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u8 saw_opt;

	tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
	tp->syn_ect_rcv = dsf & INET_ECN_MASK;

	/* No AccECN option received, or counters were already seen */
	if (!tp->rx_opt.accecn)
		return;
	if (tp->saw_accecn_opt >= TCP_ACCECN_OPT_COUNTER_SEEN)
		return;

	/* Demand Accurate ECN option in response to the SYN on the SYN/ACK
	 * and the TCP server will try to send one more packet with an AccECN
	 * Option at a later point during the connection.
	 */
	saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
	tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
	tp->accecn_opt_demand = 2;
}
488
489 /* See Table 2 of the AccECN draft */
tcp_ecn_rcv_synack(struct sock * sk,const struct sk_buff * skb,const struct tcphdr * th,u8 ip_dsfield)490 static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
491 const struct tcphdr *th, u8 ip_dsfield)
492 {
493 struct tcp_sock *tp = tcp_sk(sk);
494 u8 ace = tcp_accecn_ace(th);
495
496 switch (ace) {
497 case 0x0:
498 case 0x7:
499 /* +========+========+============+=============+
500 * | A | B | SYN/ACK | Feedback |
501 * | | | B->A | Mode of A |
502 * | | | AE CWR ECE | |
503 * +========+========+============+=============+
504 * | AccECN | No ECN | 0 0 0 | Not ECN |
505 * | AccECN | Broken | 1 1 1 | Not ECN |
506 * +========+========+============+=============+
507 */
508 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
509 break;
510 case 0x1:
511 /* +========+========+============+=============+
512 * | A | B | SYN/ACK | Feedback |
513 * | | | B->A | Mode of A |
514 * | | | AE CWR ECE | |
515 * +========+========+============+=============+
516 * | AccECN | ECN | 0 0 1 | Classic ECN |
517 * | Nonce | AccECN | 0 0 1 | Classic ECN |
518 * | ECN | AccECN | 0 0 1 | Classic ECN |
519 * +========+========+============+=============+
520 */
521 if (tcp_ca_no_fallback_rfc3168(sk))
522 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
523 else
524 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
525 break;
526 case 0x5:
527 if (tcp_ecn_mode_pending(tp)) {
528 tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
529 if (INET_ECN_is_ce(ip_dsfield)) {
530 tp->received_ce++;
531 tp->received_ce_pending++;
532 }
533 }
534 break;
535 default:
536 tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
537 if (INET_ECN_is_ce(ip_dsfield) &&
538 tcp_accecn_validate_syn_feedback(sk, ace,
539 tp->syn_ect_snt)) {
540 tp->received_ce++;
541 tp->received_ce_pending++;
542 }
543 break;
544 }
545 }
546
tcp_ecn_rcv_syn(struct sock * sk,const struct tcphdr * th,const struct sk_buff * skb)547 static inline void tcp_ecn_rcv_syn(struct sock *sk, const struct tcphdr *th,
548 const struct sk_buff *skb)
549 {
550 struct tcp_sock *tp = tcp_sk(sk);
551
552 if (tcp_ecn_mode_pending(tp)) {
553 if (!tcp_accecn_syn_requested(th)) {
554 /* Downgrade to classic ECN feedback */
555 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
556 } else {
557 tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
558 INET_ECN_MASK;
559 tp->prev_ecnfield = tp->syn_ect_rcv;
560 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
561 }
562 }
563 if (tcp_ecn_mode_rfc3168(tp) &&
564 (!th->ece || !th->cwr || tcp_ca_no_fallback_rfc3168(sk)))
565 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
566 }
567
tcp_ecn_rcv_ecn_echo(const struct tcp_sock * tp,const struct tcphdr * th)568 static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
569 const struct tcphdr *th)
570 {
571 if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
572 return true;
573 return false;
574 }
575
576 /* Packet ECN state for a SYN-ACK */
tcp_ecn_send_synack(struct sock * sk,struct sk_buff * skb)577 static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
578 {
579 struct tcp_sock *tp = tcp_sk(sk);
580
581 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
582 if (tcp_ecn_disabled(tp))
583 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
584 else if (tcp_ca_needs_ecn(sk) ||
585 tcp_bpf_ca_needs_ecn(sk))
586 INET_ECN_xmit_ect_1_negotiation(sk);
587
588 if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
589 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
590 TCP_SKB_CB(skb)->tcp_flags |=
591 tcp_accecn_reflector_flags(tp->syn_ect_rcv);
592 tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
593 }
594 }
595
596 /* Packet ECN state for a SYN. */
tcp_ecn_send_syn(struct sock * sk,struct sk_buff * skb)597 static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
598 {
599 struct tcp_sock *tp = tcp_sk(sk);
600 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
601 bool use_ecn, use_accecn;
602 u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
603
604 use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN ||
605 tcp_ca_needs_accecn(sk);
606 use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
607 tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
608 tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
609
610 if (!use_ecn) {
611 const struct dst_entry *dst = __sk_dst_get(sk);
612
613 if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
614 use_ecn = true;
615 }
616
617 tp->ecn_flags = 0;
618
619 if (use_ecn) {
620 if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
621 INET_ECN_xmit_ect_1_negotiation(sk);
622
623 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
624 if (use_accecn) {
625 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
626 tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
627 tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
628 } else {
629 tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
630 }
631 }
632 }
633
tcp_ecn_clear_syn(struct sock * sk,struct sk_buff * skb)634 static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
635 {
636 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
637 /* tp->ecn_flags are cleared at a later point in time when
638 * SYN ACK is ultimatively being received.
639 */
640 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
641 }
642 }
643
644 static inline void
tcp_ecn_make_synack(const struct request_sock * req,struct tcphdr * th,enum tcp_synack_type synack_type)645 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
646 enum tcp_synack_type synack_type)
647 {
648 /* Accurate ECN shall retransmit SYN/ACK with ACE=0 if the
649 * previously retransmitted SYN/ACK also times out.
650 */
651 if (!req->num_timeout || synack_type != TCP_SYNACK_RETRANS) {
652 if (tcp_rsk(req)->accecn_ok)
653 tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
654 else if (inet_rsk(req)->ecn_ok)
655 th->ece = 1;
656 } else if (tcp_rsk(req)->accecn_ok) {
657 th->ae = 0;
658 th->cwr = 0;
659 th->ece = 0;
660 }
661 }
662
tcp_accecn_option_beacon_check(const struct sock * sk)663 static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
664 {
665 u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
666 const struct tcp_sock *tp = tcp_sk(sk);
667
668 if (!ecn_beacon)
669 return false;
670
671 return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
672 (tp->srtt_us >> 3);
673 }
674
#endif /* _TCP_ECN_H */
676