xref: /linux/tools/testing/selftests/bpf/progs/test_cls_redirect_dynptr.c (revision 1260ed77798502de9c98020040d2995008de10cc)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 // Copyright (c) 2019, 2020 Cloudflare
3 
4 #include <stdbool.h>
5 #include <stddef.h>
6 #include <stdint.h>
7 #include <string.h>
8 
9 #include <linux/bpf.h>
10 #include <linux/icmp.h>
11 #include <linux/icmpv6.h>
12 #include <linux/if_ether.h>
13 #include <linux/in.h>
14 #include <linux/ip.h>
15 #include <linux/ipv6.h>
16 #include <linux/pkt_cls.h>
17 #include <linux/tcp.h>
18 #include <netinet/udp.h>
19 
20 #include <bpf/bpf_helpers.h>
21 #include <bpf/bpf_endian.h>
22 
23 #include "test_cls_redirect.h"
24 #include "bpf_kfuncs.h"
25 
26 #pragma GCC diagnostic ignored "-Waddress-of-packed-member"
27 
28 #define offsetofend(TYPE, MEMBER) \
29 	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
30 
31 #define IP_OFFSET_MASK (0x1FFF)
32 #define IP_MF (0x2000)
33 
34 char _license[] SEC("license") = "Dual BSD/GPL";
35 
36 /**
37  * Destination port and IP used for UDP encapsulation.
38  */
39 volatile const __be16 ENCAPSULATION_PORT;
40 volatile const __be32 ENCAPSULATION_IP;
41 
42 typedef struct {
43 	uint64_t processed_packets_total;
44 	uint64_t l3_protocol_packets_total_ipv4;
45 	uint64_t l3_protocol_packets_total_ipv6;
46 	uint64_t l4_protocol_packets_total_tcp;
47 	uint64_t l4_protocol_packets_total_udp;
48 	uint64_t accepted_packets_total_syn;
49 	uint64_t accepted_packets_total_syn_cookies;
50 	uint64_t accepted_packets_total_last_hop;
51 	uint64_t accepted_packets_total_icmp_echo_request;
52 	uint64_t accepted_packets_total_established;
53 	uint64_t forwarded_packets_total_gue;
54 	uint64_t forwarded_packets_total_gre;
55 
56 	uint64_t errors_total_unknown_l3_proto;
57 	uint64_t errors_total_unknown_l4_proto;
58 	uint64_t errors_total_malformed_ip;
59 	uint64_t errors_total_fragmented_ip;
60 	uint64_t errors_total_malformed_icmp;
61 	uint64_t errors_total_unwanted_icmp;
62 	uint64_t errors_total_malformed_icmp_pkt_too_big;
63 	uint64_t errors_total_malformed_tcp;
64 	uint64_t errors_total_malformed_udp;
65 	uint64_t errors_total_icmp_echo_replies;
66 	uint64_t errors_total_malformed_encapsulation;
67 	uint64_t errors_total_encap_adjust_failed;
68 	uint64_t errors_total_encap_buffer_too_small;
69 	uint64_t errors_total_redirect_loop;
70 	uint64_t errors_total_encap_mtu_violate;
71 } metrics_t;
72 
73 typedef enum {
74 	INVALID = 0,
75 	UNKNOWN,
76 	ECHO_REQUEST,
77 	SYN,
78 	SYN_COOKIE,
79 	ESTABLISHED,
80 } verdict_t;
81 
82 typedef struct {
83 	uint16_t src, dst;
84 } flow_ports_t;
85 
86 _Static_assert(
87 	sizeof(flow_ports_t) !=
88 		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
89 			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
90 	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
91 _Static_assert(
92 	sizeof(flow_ports_t) !=
93 		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
94 			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
95 	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
96 
97 struct iphdr_info {
98 	void *hdr;
99 	__u64 len;
100 };
101 
102 typedef int ret_t;
103 
104 /* This is a bit of a hack. We need a return value which allows us to
105  * indicate that the regular flow of the program should continue,
106  * while allowing functions to use XDP_PASS and XDP_DROP, etc.
107  */
108 static const ret_t CONTINUE_PROCESSING = -1;
109 
110 /* Convenience macro to call functions which return ret_t.
111  */
112 #define MAYBE_RETURN(x)                           \
113 	do {                                      \
114 		ret_t __ret = x;                  \
115 		if (__ret != CONTINUE_PROCESSING) \
116 			return __ret;             \
117 	} while (0)
118 
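/* True if this IPv4 header belongs to a fragment: MF is set or the
 * fragment offset is non-zero.
 */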
119 static bool ipv4_is_fragment(const struct iphdr *ip)
120 {
121 	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
122 	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
123 }
124 
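/* Read the IPv4 header from the dynptr and advance *offset past it,
 * including any options. Returns 0 on success, -1 on a short read or
 * an invalid IHL.
 */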
125 static int pkt_parse_ipv4(struct bpf_dynptr *dynptr, __u64 *offset, struct iphdr *iphdr)
126 {
127 	if (bpf_dynptr_read(iphdr, sizeof(*iphdr), dynptr, *offset, 0))
128 		return -1;
129 
130 	*offset += sizeof(*iphdr);
131 
132 	if (iphdr->ihl < 5)
133 		return -1;
134 
135 	/* skip ipv4 options */
136 	*offset += (iphdr->ihl - 5) * 4;
137 
138 	return 0;
139 }
140 
141 /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
142 static bool pkt_parse_icmp_l4_ports(struct bpf_dynptr *dynptr, __u64 *offset, flow_ports_t *ports)
143 {
144 	if (bpf_dynptr_read(ports, sizeof(*ports), dynptr, *offset, 0))
145 		return false;
146 
147 	*offset += sizeof(*ports);
148 
149 	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
150 	 * payload which is going towards the eyeball.
151 	 */
152 	uint16_t dst = ports->src;
153 	ports->src = ports->dst;
154 	ports->dst = dst;
155 	return true;
156 }
157 
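/* Fold a 32-bit checksum accumulator into its final 16-bit ones'
 * complement value.
 */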
158 static uint16_t pkt_checksum_fold(uint32_t csum)
159 {
160 	/* The highest reasonable value for an IPv4 header
161 	 * checksum requires two folds, so we just do that always.
162 	 */
163 	csum = (csum & 0xffff) + (csum >> 16);
164 	csum = (csum & 0xffff) + (csum >> 16);
165 	return (uint16_t)~csum;
166 }
167 
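/* Recompute iph->check in place. Only valid for a 20-byte header
 * without options.
 */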
168 static void pkt_ipv4_checksum(struct iphdr *iph)
169 {
170 	iph->check = 0;
171 
172 	/* An IP header without options is 20 bytes. Two of those
173 	 * are the checksum, which we always set to zero. Hence,
174 	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
175 	 * which fits in 32 bit.
176 	 */
177 	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
178 	uint32_t acc = 0;
179 	uint16_t *ipw = (uint16_t *)iph;
180 
181 	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++)
182 		acc += ipw[i];
183 
184 	iph->check = pkt_checksum_fold(acc);
185 }
186 
187 static bool pkt_skip_ipv6_extension_headers(struct bpf_dynptr *dynptr, __u64 *offset,
188 					    const struct ipv6hdr *ipv6, uint8_t *upper_proto,
189 					    bool *is_fragment)
190 {
191 	/* We understand five extension headers.
192 	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
193 	 * headers should occur once, except Destination Options, which may
194 	 * occur twice. Hence we give up after 6 headers.
195 	 */
196 	struct {
197 		uint8_t next;
198 		uint8_t len;
199 	} exthdr = {
200 		.next = ipv6->nexthdr,
201 	};
202 	*is_fragment = false;
203 
204 	for (int i = 0; i < 6; i++) {
205 		switch (exthdr.next) {
206 		case IPPROTO_FRAGMENT:
207 			*is_fragment = true;
208 			/* NB: We don't check that hdrlen == 0 as per spec. */
209 			/* fallthrough; */
210 
211 		case IPPROTO_HOPOPTS:
212 		case IPPROTO_ROUTING:
213 		case IPPROTO_DSTOPTS:
214 		case IPPROTO_MH:
215 			if (bpf_dynptr_read(&exthdr, sizeof(exthdr), dynptr, *offset, 0))
216 				return false;
217 
218 			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
219 			*offset += (exthdr.len + 1) * 8;
220 
221 			/* Decode next header */
222 			break;
223 
224 		default:
225 			/* The next header is not one of the known extension
226 			 * headers, treat it as the upper layer header.
227 			 *
228 			 * This handles IPPROTO_NONE.
229 			 *
230 			 * Encapsulating Security Payload (50) and Authentication
231 			 * Header (51) also end up here (and will trigger an
232 			 * unknown proto error later). They have a custom header
233 			 * format and seem too esoteric to care about.
234 			 */
235 			*upper_proto = exthdr.next;
236 			return true;
237 		}
238 	}
239 
240 	/* We never found an upper layer header. */
241 	return false;
242 }
243 
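/* Read the fixed IPv6 header, skip any extension headers and report the
 * upper-layer protocol. Advances *offset to the upper-layer header.
 */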
244 static int pkt_parse_ipv6(struct bpf_dynptr *dynptr, __u64 *offset, struct ipv6hdr *ipv6,
245 			  uint8_t *proto, bool *is_fragment)
246 {
247 	if (bpf_dynptr_read(ipv6, sizeof(*ipv6), dynptr, *offset, 0))
248 		return -1;
249 
250 	*offset += sizeof(*ipv6);
251 
252 	if (!pkt_skip_ipv6_extension_headers(dynptr, offset, ipv6, proto, is_fragment))
253 		return -1;
254 
255 	return 0;
256 }
257 
258 /* Global metrics, per CPU
259  */
260 struct {
261 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
262 	__uint(max_entries, 1);
263 	__type(key, unsigned int);
264 	__type(value, metrics_t);
265 } metrics_map SEC(".maps");
266 
267 static metrics_t *get_global_metrics(void)
268 {
269 	uint64_t key = 0;
270 	return bpf_map_lookup_elem(&metrics_map, &key);
271 }
272 
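/* Strip the encapsulation headers and redirect the decapsulated packet
 * back to the ingress path of the receiving interface, so that the
 * local stack picks it up.
 */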
273 static ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
274 {
275 	const int payload_off =
276 		sizeof(*encap) +
277 		sizeof(struct in_addr) * encap->unigue.hop_count;
278 	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
279 
280 	/* Change the ethertype if the encapsulated packet is IPv6 */
281 	if (encap->gue.proto_ctype == IPPROTO_IPV6)
282 		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
283 
284 	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
285 				BPF_F_ADJ_ROOM_FIXED_GSO |
286 				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
287 	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
288 		return TC_ACT_SHOT;
289 
290 	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
291 }
292 
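/* Rewrite the GUE encapsulation into a GRE tunnel towards next_hop and
 * redirect the packet out of the receiving interface. The inner
 * TTL/hop limit is decremented first as loop protection.
 */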
293 static ret_t forward_with_gre(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
294 			      encap_headers_t *encap, struct in_addr *next_hop,
295 			      metrics_t *metrics)
296 {
297 	const int payload_off =
298 		sizeof(*encap) +
299 		sizeof(struct in_addr) * encap->unigue.hop_count;
300 	int32_t encap_overhead =
301 		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
302 	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
303 	__u8 encap_buffer[sizeof(encap_gre_t)] = {};
304 	uint16_t proto = ETH_P_IP;
305 	uint32_t mtu_len = 0;
306 	encap_gre_t *encap_gre;
307 
308 	metrics->forwarded_packets_total_gre++;
309 
310 	/* Loop protection: the inner packet's TTL is decremented as a safeguard
311 	 * against any forwarding loop. As the only interesting field is the TTL
312 	 * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes,
313 	 * as they handle non-linear packets if needed (no need for the data to be
314 	 * in the linear section).
315 	 */
316 	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
317 		proto = ETH_P_IPV6;
318 		uint8_t ttl;
319 		int rc;
320 
321 		rc = bpf_skb_load_bytes(
322 			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
323 			&ttl, 1);
324 		if (rc != 0) {
325 			metrics->errors_total_malformed_encapsulation++;
326 			return TC_ACT_SHOT;
327 		}
328 
329 		if (ttl == 0) {
330 			metrics->errors_total_redirect_loop++;
331 			return TC_ACT_SHOT;
332 		}
333 
334 		ttl--;
335 		rc = bpf_skb_store_bytes(
336 			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
337 			&ttl, 1, 0);
338 		if (rc != 0) {
339 			metrics->errors_total_malformed_encapsulation++;
340 			return TC_ACT_SHOT;
341 		}
342 	} else {
343 		uint8_t ttl;
344 		int rc;
345 
346 		rc = bpf_skb_load_bytes(
347 			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
348 			1);
349 		if (rc != 0) {
350 			metrics->errors_total_malformed_encapsulation++;
351 			return TC_ACT_SHOT;
352 		}
353 
354 		if (ttl == 0) {
355 			metrics->errors_total_redirect_loop++;
356 			return TC_ACT_SHOT;
357 		}
358 
359 		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
360 		 * bpf_l3_csum_replace() only accepts 2- and 4-byte sizes (the result
361 		 * is the same either way).
362 		 */
363 		rc = bpf_l3_csum_replace(
364 			skb, payload_off + offsetof(struct iphdr, check), ttl,
365 			ttl - 1, 2);
366 		if (rc != 0) {
367 			metrics->errors_total_malformed_encapsulation++;
368 			return TC_ACT_SHOT;
369 		}
370 
371 		ttl--;
372 		rc = bpf_skb_store_bytes(
373 			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
374 			0);
375 		if (rc != 0) {
376 			metrics->errors_total_malformed_encapsulation++;
377 			return TC_ACT_SHOT;
378 		}
379 	}
380 
381 	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
382 		metrics->errors_total_encap_mtu_violate++;
383 		return TC_ACT_SHOT;
384 	}
385 
386 	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
387 				BPF_F_ADJ_ROOM_FIXED_GSO |
388 				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
389 	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
390 		metrics->errors_total_encap_adjust_failed++;
391 		return TC_ACT_SHOT;
392 	}
393 
394 	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
395 		metrics->errors_total_encap_buffer_too_small++;
396 		return TC_ACT_SHOT;
397 	}
398 
399 	encap_gre = bpf_dynptr_slice_rdwr(dynptr, 0, encap_buffer, sizeof(encap_buffer));
400 	if (!encap_gre) {
401 		metrics->errors_total_encap_buffer_too_small++;
402 		return TC_ACT_SHOT;
403 	}
404 
405 	encap_gre->ip.protocol = IPPROTO_GRE;
406 	encap_gre->ip.daddr = next_hop->s_addr;
407 	encap_gre->ip.saddr = ENCAPSULATION_IP;
408 	encap_gre->ip.tot_len =
409 		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
410 	encap_gre->gre.flags = 0;
411 	encap_gre->gre.protocol = bpf_htons(proto);
412 	pkt_ipv4_checksum((void *)&encap_gre->ip);
413 
414 	if (encap_gre == encap_buffer)
415 		bpf_dynptr_write(dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);
416 
417 	return bpf_redirect(skb->ifindex, 0);
418 }
419 
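/* Send the packet back to the router towards next_hop: either
 * re-encapsulated as GRE for the final hop, or as GUE with the outer
 * addresses rewritten and the hop counter advanced.
 */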
420 static ret_t forward_to_next_hop(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
421 				 encap_headers_t *encap, struct in_addr *next_hop,
422 				 metrics_t *metrics)
423 {
424 	/* swap L2 addresses */
425 	/* This assumes that packets are received from a router.
426 	 * So just swapping the MAC addresses here will make the packet go back to
427 	 * the router, which will send it to the appropriate machine.
428 	 */
429 	unsigned char temp[ETH_ALEN];
430 	memcpy(temp, encap->eth.h_dest, sizeof(temp));
431 	memcpy(encap->eth.h_dest, encap->eth.h_source,
432 	       sizeof(encap->eth.h_dest));
433 	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
434 
435 	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
436 	    encap->unigue.last_hop_gre) {
437 		return forward_with_gre(skb, dynptr, encap, next_hop, metrics);
438 	}
439 
440 	metrics->forwarded_packets_total_gue++;
441 	uint32_t old_saddr = encap->ip.saddr;
442 	encap->ip.saddr = encap->ip.daddr;
443 	encap->ip.daddr = next_hop->s_addr;
444 	if (encap->unigue.next_hop < encap->unigue.hop_count) {
445 		encap->unigue.next_hop++;
446 	}
447 
448 	/* Remove ip->saddr, add next_hop->s_addr */
449 	const uint64_t off = offsetof(typeof(*encap), ip.check);
450 	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
451 	if (ret < 0) {
452 		return TC_ACT_SHOT;
453 	}
454 
455 	return bpf_redirect(skb->ifindex, 0);
456 }
457 
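/* Advance *offset past n next-hop addresses. Only n == 0 or n == 1 is
 * valid; case 1 intentionally falls through to case 0.
 */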
458 static ret_t skip_next_hops(__u64 *offset, int n)
459 {
460 	switch (n) {
461 	case 1:
462 		*offset += sizeof(struct in_addr);
463 	case 0:
464 		return CONTINUE_PROCESSING;
465 
466 	default:
467 		return TC_ACT_SHOT;
468 	}
469 }
470 
471 /* Get the next hop from the GLB header.
472  *
473  * Sets next_hop->s_addr to 0 if there are no more hops left.
474  * *offset points just past the variable-length GLB header
475  * iff the call is successful.
476  */
477 static ret_t get_next_hop(struct bpf_dynptr *dynptr, __u64 *offset, encap_headers_t *encap,
478 			  struct in_addr *next_hop)
479 {
480 	if (encap->unigue.next_hop > encap->unigue.hop_count)
481 		return TC_ACT_SHOT;
482 
483 	/* Skip "used" next hops. */
484 	MAYBE_RETURN(skip_next_hops(offset, encap->unigue.next_hop));
485 
486 	if (encap->unigue.next_hop == encap->unigue.hop_count) {
487 		/* No more next hops, we are at the end of the GLB header. */
488 		next_hop->s_addr = 0;
489 		return CONTINUE_PROCESSING;
490 	}
491 
492 	if (bpf_dynptr_read(next_hop, sizeof(*next_hop), dynptr, *offset, 0))
493 		return TC_ACT_SHOT;
494 
495 	*offset += sizeof(*next_hop);
496 
497 	/* Skip the remaining next hops (may be zero). */
498 	return skip_next_hops(offset, encap->unigue.hop_count - encap->unigue.next_hop - 1);
499 }
500 
501 /* Fill a bpf_sock_tuple to be used with the socket lookup functions.
502  * This is a kludge that lets us work around verifier limitations:
503  *
504  *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
505  *
506  * clang will substitute a constant for sizeof, which allows the verifier
507  * to track its value. Based on this, it can figure out the constant
508  * return value, and calling code works while still being "generic" to
509  * IPv4 and IPv6.
510  */
511 static uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
512 				    uint64_t iphlen, uint16_t sport, uint16_t dport)
513 {
514 	switch (iphlen) {
515 	case sizeof(struct iphdr): {
516 		struct iphdr *ipv4 = (struct iphdr *)iph;
517 		tuple->ipv4.daddr = ipv4->daddr;
518 		tuple->ipv4.saddr = ipv4->saddr;
519 		tuple->ipv4.sport = sport;
520 		tuple->ipv4.dport = dport;
521 		return sizeof(tuple->ipv4);
522 	}
523 
524 	case sizeof(struct ipv6hdr): {
525 		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
526 		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
527 		       sizeof(tuple->ipv6.daddr));
528 		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
529 		       sizeof(tuple->ipv6.saddr));
530 		tuple->ipv6.sport = sport;
531 		tuple->ipv6.dport = dport;
532 		return sizeof(tuple->ipv6);
533 	}
534 
535 	default:
536 		return 0;
537 	}
538 }
539 
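/* Classify a TCP flow via socket lookup: ESTABLISHED if a non-listening
 * socket exists, SYN_COOKIE if the segment completes a valid syncookie
 * handshake, UNKNOWN otherwise.
 */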
540 static verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
541 			      uint64_t tuplen, void *iph, struct tcphdr *tcp)
542 {
543 	struct bpf_sock *sk =
544 		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
545 
546 	if (sk == NULL)
547 		return UNKNOWN;
548 
549 	if (sk->state != BPF_TCP_LISTEN) {
550 		bpf_sk_release(sk);
551 		return ESTABLISHED;
552 	}
553 
554 	if (iph != NULL && tcp != NULL) {
555 		/* Kludge: we've run out of arguments, but need the length of the ip header. */
556 		uint64_t iphlen = sizeof(struct iphdr);
557 
558 		if (tuplen == sizeof(tuple->ipv6))
559 			iphlen = sizeof(struct ipv6hdr);
560 
561 		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
562 					    sizeof(*tcp)) == 0) {
563 			bpf_sk_release(sk);
564 			return SYN_COOKIE;
565 		}
566 	}
567 
568 	bpf_sk_release(sk);
569 	return UNKNOWN;
570 }
571 
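/* Classify a UDP flow via socket lookup: ESTABLISHED for a connected
 * socket, UNKNOWN otherwise.
 */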
572 static verdict_t classify_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, uint64_t tuplen)
573 {
574 	struct bpf_sock *sk =
575 		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
576 
577 	if (sk == NULL)
578 		return UNKNOWN;
579 
580 	if (sk->state == BPF_TCP_ESTABLISHED) {
581 		bpf_sk_release(sk);
582 		return ESTABLISHED;
583 	}
584 
585 	bpf_sk_release(sk);
586 	return UNKNOWN;
587 }
588 
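/* Classify the flow embedded in an ICMP error by looking up the
 * corresponding TCP or UDP socket.
 */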
589 static verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto, struct bpf_sock_tuple *tuple,
590 			       uint64_t tuplen, metrics_t *metrics)
591 {
592 	switch (proto) {
593 	case IPPROTO_TCP:
594 		return classify_tcp(skb, tuple, tuplen, NULL, NULL);
595 
596 	case IPPROTO_UDP:
597 		return classify_udp(skb, tuple, tuplen);
598 
599 	default:
600 		metrics->errors_total_malformed_icmp++;
601 		return INVALID;
602 	}
603 }
604 
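/* Handle an encapsulated ICMPv4 message: echo requests are accepted,
 * "fragmentation needed" errors are classified by the embedded flow,
 * everything else is invalid or unwanted.
 */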
605 static verdict_t process_icmpv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr, __u64 *offset,
606 				metrics_t *metrics)
607 {
608 	struct icmphdr icmp;
609 	struct iphdr ipv4;
610 
611 	if (bpf_dynptr_read(&icmp, sizeof(icmp), dynptr, *offset, 0)) {
612 		metrics->errors_total_malformed_icmp++;
613 		return INVALID;
614 	}
615 
616 	*offset += sizeof(icmp);
617 
618 	/* We should never receive encapsulated echo replies. */
619 	if (icmp.type == ICMP_ECHOREPLY) {
620 		metrics->errors_total_icmp_echo_replies++;
621 		return INVALID;
622 	}
623 
624 	if (icmp.type == ICMP_ECHO)
625 		return ECHO_REQUEST;
626 
627 	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
628 		metrics->errors_total_unwanted_icmp++;
629 		return INVALID;
630 	}
631 
632 	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
633 		metrics->errors_total_malformed_icmp_pkt_too_big++;
634 		return INVALID;
635 	}
636 
637 	/* The source address in the outer IP header is from the entity that
638 	 * originated the ICMP message. Use the original IP header to restore
639 	 * the correct flow tuple.
640 	 */
641 	struct bpf_sock_tuple tuple;
642 	tuple.ipv4.saddr = ipv4.daddr;
643 	tuple.ipv4.daddr = ipv4.saddr;
644 
645 	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv4.sport)) {
646 		metrics->errors_total_malformed_icmp_pkt_too_big++;
647 		return INVALID;
648 	}
649 
650 	return classify_icmp(skb, ipv4.protocol, &tuple,
651 			     sizeof(tuple.ipv4), metrics);
652 }
653 
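/* Handle an encapsulated ICMPv6 message: echo requests are accepted,
 * "packet too big" errors are classified by the embedded flow,
 * everything else is invalid or unwanted.
 */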
654 static verdict_t process_icmpv6(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
655 				metrics_t *metrics)
656 {
657 	struct bpf_sock_tuple tuple;
658 	struct ipv6hdr ipv6;
659 	struct icmp6hdr icmp6;
660 	bool is_fragment;
661 	uint8_t l4_proto;
662 
663 	if (bpf_dynptr_read(&icmp6, sizeof(icmp6), dynptr, *offset, 0)) {
664 		metrics->errors_total_malformed_icmp++;
665 		return INVALID;
666 	}
667 
668 	/* We should never receive encapsulated echo replies. */
669 	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
670 		metrics->errors_total_icmp_echo_replies++;
671 		return INVALID;
672 	}
673 
674 	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
675 		return ECHO_REQUEST;
676 	}
677 
678 	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
679 		metrics->errors_total_unwanted_icmp++;
680 		return INVALID;
681 	}
682 
683 	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
684 		metrics->errors_total_malformed_icmp_pkt_too_big++;
685 		return INVALID;
686 	}
687 
688 	if (is_fragment) {
689 		metrics->errors_total_fragmented_ip++;
690 		return INVALID;
691 	}
692 
693 	/* Swap source and dest addresses. */
694 	memcpy(&tuple.ipv6.saddr, &ipv6.daddr, sizeof(tuple.ipv6.saddr));
695 	memcpy(&tuple.ipv6.daddr, &ipv6.saddr, sizeof(tuple.ipv6.daddr));
696 
697 	if (!pkt_parse_icmp_l4_ports(dynptr, offset, (flow_ports_t *)&tuple.ipv6.sport)) {
698 		metrics->errors_total_malformed_icmp_pkt_too_big++;
699 		return INVALID;
700 	}
701 
702 	return classify_icmp(skb, l4_proto, &tuple, sizeof(tuple.ipv6),
703 			     metrics);
704 }
705 
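/* Parse the TCP header: SYNs are accepted for local handling, anything
 * else is classified via socket lookup.
 */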
706 static verdict_t process_tcp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
707 			     struct iphdr_info *info, metrics_t *metrics)
708 {
709 	struct bpf_sock_tuple tuple;
710 	struct tcphdr tcp;
711 	uint64_t tuplen;
712 
713 	metrics->l4_protocol_packets_total_tcp++;
714 
715 	if (bpf_dynptr_read(&tcp, sizeof(tcp), dynptr, *offset, 0)) {
716 		metrics->errors_total_malformed_tcp++;
717 		return INVALID;
718 	}
719 
720 	*offset += sizeof(tcp);
721 
722 	if (tcp.syn)
723 		return SYN;
724 
725 	tuplen = fill_tuple(&tuple, info->hdr, info->len, tcp.source, tcp.dest);
726 	return classify_tcp(skb, &tuple, tuplen, info->hdr, &tcp);
727 }
728 
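/* Parse the UDP header and classify the flow via socket lookup. */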
729 static verdict_t process_udp(struct bpf_dynptr *dynptr, __u64 *offset, struct __sk_buff *skb,
730 			     struct iphdr_info *info, metrics_t *metrics)
731 {
732 	struct bpf_sock_tuple tuple;
733 	struct udphdr udph;
734 	uint64_t tuplen;
735 
736 	metrics->l4_protocol_packets_total_udp++;
737 
738 	if (bpf_dynptr_read(&udph, sizeof(udph), dynptr, *offset, 0)) {
739 		metrics->errors_total_malformed_udp++;
740 		return INVALID;
741 	}
742 	*offset += sizeof(udph);
743 
744 	tuplen = fill_tuple(&tuple, info->hdr, info->len, udph.source, udph.dest);
745 	return classify_udp(skb, &tuple, tuplen);
746 }
747 
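/* Parse the inner IPv4 packet and dispatch on its L4 protocol.
 * Fragments are rejected since they cannot be classified.
 */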
748 static verdict_t process_ipv4(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
749 			      __u64 *offset, metrics_t *metrics)
750 {
751 	struct iphdr ipv4;
752 	struct iphdr_info info = {
753 		.hdr = &ipv4,
754 		.len = sizeof(ipv4),
755 	};
756 
757 	metrics->l3_protocol_packets_total_ipv4++;
758 
759 	if (pkt_parse_ipv4(dynptr, offset, &ipv4)) {
760 		metrics->errors_total_malformed_ip++;
761 		return INVALID;
762 	}
763 
764 	if (ipv4.version != 4) {
765 		metrics->errors_total_malformed_ip++;
766 		return INVALID;
767 	}
768 
769 	if (ipv4_is_fragment(&ipv4)) {
770 		metrics->errors_total_fragmented_ip++;
771 		return INVALID;
772 	}
773 
774 	switch (ipv4.protocol) {
775 	case IPPROTO_ICMP:
776 		return process_icmpv4(skb, dynptr, offset, metrics);
777 
778 	case IPPROTO_TCP:
779 		return process_tcp(dynptr, offset, skb, &info, metrics);
780 
781 	case IPPROTO_UDP:
782 		return process_udp(dynptr, offset, skb, &info, metrics);
783 
784 	default:
785 		metrics->errors_total_unknown_l4_proto++;
786 		return INVALID;
787 	}
788 }
789 
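/* Parse the inner IPv6 packet and dispatch on its upper-layer protocol.
 * Fragments are rejected since they cannot be classified.
 */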
790 static verdict_t process_ipv6(struct __sk_buff *skb, struct bpf_dynptr *dynptr,
791 			      __u64 *offset, metrics_t *metrics)
792 {
793 	struct ipv6hdr ipv6;
794 	struct iphdr_info info = {
795 		.hdr = &ipv6,
796 		.len = sizeof(ipv6),
797 	};
798 	uint8_t l4_proto;
799 	bool is_fragment;
800 
801 	metrics->l3_protocol_packets_total_ipv6++;
802 
803 	if (pkt_parse_ipv6(dynptr, offset, &ipv6, &l4_proto, &is_fragment)) {
804 		metrics->errors_total_malformed_ip++;
805 		return INVALID;
806 	}
807 
808 	if (ipv6.version != 6) {
809 		metrics->errors_total_malformed_ip++;
810 		return INVALID;
811 	}
812 
813 	if (is_fragment) {
814 		metrics->errors_total_fragmented_ip++;
815 		return INVALID;
816 	}
817 
818 	switch (l4_proto) {
819 	case IPPROTO_ICMPV6:
820 		return process_icmpv6(dynptr, offset, skb, metrics);
821 
822 	case IPPROTO_TCP:
823 		return process_tcp(dynptr, offset, skb, &info, metrics);
824 
825 	case IPPROTO_UDP:
826 		return process_udp(dynptr, offset, skb, &info, metrics);
827 
828 	default:
829 		metrics->errors_total_unknown_l4_proto++;
830 		return INVALID;
831 	}
832 }
833 
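/* Main TC classifier: validate the GUE/GLB encapsulation, pick the next
 * hop, classify the inner packet and either accept it locally or
 * forward it on.
 */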
834 SEC("tc")
835 int cls_redirect(struct __sk_buff *skb)
836 {
837 	__u8 encap_buffer[sizeof(encap_headers_t)] = {};
838 	struct bpf_dynptr dynptr;
839 	struct in_addr next_hop;
840 	/* Tracks offset of the dynptr. This will be unnecessary once
841 	 * bpf_dynptr_advance() is available.
842 	 */
843 	__u64 off = 0;
844 	ret_t ret;
845 
846 	bpf_dynptr_from_skb(skb, 0, &dynptr);
847 
848 	metrics_t *metrics = get_global_metrics();
849 	if (metrics == NULL)
850 		return TC_ACT_SHOT;
851 
852 	metrics->processed_packets_total++;
853 
854 	/* Pass bogus packets as long as we're not sure they're
855 	 * destined for us.
856 	 */
857 	if (skb->protocol != bpf_htons(ETH_P_IP))
858 		return TC_ACT_OK;
859 
860 	encap_headers_t *encap;
861 
862 	/* Make sure that all encapsulation headers are available in
863 	 * the linear portion of the skb. This makes it easy to manipulate them.
864 	 */
865 	if (bpf_skb_pull_data(skb, sizeof(*encap)))
866 		return TC_ACT_OK;
867 
868 	encap = bpf_dynptr_slice_rdwr(&dynptr, 0, encap_buffer, sizeof(encap_buffer));
869 	if (!encap)
870 		return TC_ACT_OK;
871 
872 	off += sizeof(*encap);
873 
874 	if (encap->ip.ihl != 5)
875 		/* We never have any options. */
876 		return TC_ACT_OK;
877 
878 	if (encap->ip.daddr != ENCAPSULATION_IP ||
879 	    encap->ip.protocol != IPPROTO_UDP)
880 		return TC_ACT_OK;
881 
882 	/* TODO Check UDP length? */
883 	if (encap->udp.dest != ENCAPSULATION_PORT)
884 		return TC_ACT_OK;
885 
886 	/* Now that we know the packet is destined for us, we can
887 	 * drop bogus ones.
888 	 */
889 	if (ipv4_is_fragment((void *)&encap->ip)) {
890 		metrics->errors_total_fragmented_ip++;
891 		return TC_ACT_SHOT;
892 	}
893 
894 	if (encap->gue.variant != 0) {
895 		metrics->errors_total_malformed_encapsulation++;
896 		return TC_ACT_SHOT;
897 	}
898 
899 	if (encap->gue.control != 0) {
900 		metrics->errors_total_malformed_encapsulation++;
901 		return TC_ACT_SHOT;
902 	}
903 
904 	if (encap->gue.flags != 0) {
905 		metrics->errors_total_malformed_encapsulation++;
906 		return TC_ACT_SHOT;
907 	}
908 
909 	if (encap->gue.hlen !=
910 	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
911 		metrics->errors_total_malformed_encapsulation++;
912 		return TC_ACT_SHOT;
913 	}
914 
915 	if (encap->unigue.version != 0) {
916 		metrics->errors_total_malformed_encapsulation++;
917 		return TC_ACT_SHOT;
918 	}
919 
920 	if (encap->unigue.reserved != 0)
921 		return TC_ACT_SHOT;
922 
923 	MAYBE_RETURN(get_next_hop(&dynptr, &off, encap, &next_hop));
924 
925 	if (next_hop.s_addr == 0) {
926 		metrics->accepted_packets_total_last_hop++;
927 		return accept_locally(skb, encap);
928 	}
929 
930 	verdict_t verdict;
931 	switch (encap->gue.proto_ctype) {
932 	case IPPROTO_IPIP:
933 		verdict = process_ipv4(skb, &dynptr, &off, metrics);
934 		break;
935 
936 	case IPPROTO_IPV6:
937 		verdict = process_ipv6(skb, &dynptr, &off, metrics);
938 		break;
939 
940 	default:
941 		metrics->errors_total_unknown_l3_proto++;
942 		return TC_ACT_SHOT;
943 	}
944 
945 	switch (verdict) {
946 	case INVALID:
947 		/* metrics have already been bumped */
948 		return TC_ACT_SHOT;
949 
950 	case UNKNOWN:
951 		return forward_to_next_hop(skb, &dynptr, encap, &next_hop, metrics);
952 
953 	case ECHO_REQUEST:
954 		metrics->accepted_packets_total_icmp_echo_request++;
955 		break;
956 
957 	case SYN:
958 		if (encap->unigue.forward_syn) {
959 			return forward_to_next_hop(skb, &dynptr, encap, &next_hop,
960 						   metrics);
961 		}
962 
963 		metrics->accepted_packets_total_syn++;
964 		break;
965 
966 	case SYN_COOKIE:
967 		metrics->accepted_packets_total_syn_cookies++;
968 		break;
969 
970 	case ESTABLISHED:
971 		metrics->accepted_packets_total_established++;
972 		break;
973 	}
974 
975 	ret = accept_locally(skb, encap);
976 
977 	if (encap == encap_buffer)
978 		bpf_dynptr_write(&dynptr, 0, encap_buffer, sizeof(encap_buffer), 0);
979 
980 	return ret;
981 }
982