// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
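/* Example: offsetofend(struct iphdr, saddr) evaluates to 16, since saddr
 * starts at offset 12 and is 4 bytes wide.
 */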

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
static volatile const __be16 ENCAPSULATION_PORT;
static volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
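/* Usage sketch: a helper returning ret_t composes like
 *
 *	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *
 * which returns the helper's verdict (e.g. TC_ACT_SHOT) early, but keeps
 * executing on CONTINUE_PROCESSING.
 */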

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}
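
/* Typical use of buf_assign(), with a stack buffer as fallback for data
 * that is not in the linear part of the skb:
 *
 *	struct udphdr _udp;
 *	struct udphdr *udp = buf_assign(pkt, sizeof(_udp), &_udp);
 *	if (udp == NULL)
 *		return INVALID;
 *
 * udp then points into the skb if possible, otherwise into _udp.
 */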

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

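/* A packet is (part of) a fragment if the More Fragments flag is set or the
 * 13-bit fragment offset is non-zero. Note that this also matches the first
 * fragment (offset 0, MF set), which is dropped like any other fragment.
 */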
static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}
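
/* Worked example of why one fold is not always enough: csum = 0x2fffe
 * folds to 0xfffe + 0x2 = 0x10000, which still carries into bit 16; the
 * second fold gives 0x0000 + 0x1 = 0x1, so the final checksum is ~0x1.
 */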

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

#pragma clang loop unroll(full)
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

#pragma clang loop unroll(full)
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
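
/* Example of the length arithmetic above: an extension header with
 * exthdr.len == 1 spans (1 + 1) * 8 = 16 octets in total. buf_copy()
 * already consumed the two bytes holding next/len, so buf_skip() advances
 * over the remaining 14 octets.
 */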

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	/* Change the ethertype if the encapsulated packet is IPv6. */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2- and 4-byte arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
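		/* fallthrough; */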
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
					   encap->unigue.next_hop - 1);
}
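
/* Illustration: with hop_count == 3 and next_hop == 1, get_next_hop() skips
 * the one already used hop address, copies hop[1] into *next_hop, and then
 * skips the remaining 3 - 1 - 1 = 1 address, leaving pkt positioned just
 * past the variable length part of the header.
 */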

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("classifier/cls_redirect")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined for us, so we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

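	/* gue.hlen is expressed in 32-bit words and must cover the fixed
	 * unigue header plus one word per 4-byte hop address. As an
	 * illustration, if the unigue header were 8 bytes and hop_count
	 * were 2, hlen would have to be 8 / 4 + 2 = 4.
	 */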
	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}