// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
static volatile const __be16 ENCAPSULATION_PORT;
static volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) ==
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport),
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

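	/* ihl is in 32-bit words, so anything beyond 5 words (20 bytes) is options. */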
	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
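	/* Either the MF flag or a non-zero fragment offset means "fragment". */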
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
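	/* e.g. 0x7fff9 folds to 0x7 + 0xfff9 = 0x10000, which needs a second
	 * fold down to 0x1 before the complement is taken.
	 */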
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bit.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

#pragma clang loop unroll(full)
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
				     const struct ipv6hdr *ipv6,
				     uint8_t *upper_proto,
				     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

#pragma clang loop unroll(full)
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */

		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
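	/* The map has a single per-CPU slot, so the key is always zero. */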
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	// Changing the ethertype if the encapsulated packet is ipv6
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

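	/* Strip the encapsulation headers and hand the inner packet back to
	 * our own ingress path.
	 */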
	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2 and 4 bytes arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
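	/* bpf_skb_adjust_room() above changed the packet length by delta
	 * bytes; keep the outer tot_len in sync.
	 */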
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
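		/* fallthrough: the single hop has been skipped. */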
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
			      encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
				       void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

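	/* A TCP socket that is not listening belongs to an existing connection. */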
	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

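	/* Connected UDP sockets report TCP_ESTABLISHED as their state. */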
	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

SEC("classifier/cls_redirect")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, so we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

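	/* The GUE hlen field counts the optional data in 32-bit words:
	 * the fixed unigue header plus one word per hop address.
	 */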
	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}