// SPDX-License-Identifier: GPL-2.0-only
/* Unstable Conntrack Helpers for XDP and TC-BPF hook
 *
 * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
 * allowed to break compatibility for these functions since the interface they
 * are exposed through to BPF programs is explicitly unstable.
 */

#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>
#include <net/xdp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>

/* bpf_ct_opts - Options for CT lookup helpers
 *
 * Members:
 * @netns_id    - Specify the network namespace for lookup
 *		  Values:
 *		    BPF_F_CURRENT_NETNS (-1)
 *		      Use namespace associated with ctx (xdp_md, __sk_buff)
 *		    [0, S32_MAX]
 *		      Network Namespace ID
 * @error       - Out parameter, set for any errors encountered
 *		  Values:
 *		    -EINVAL       - Passed NULL for bpf_tuple pointer
 *		    -EINVAL       - opts->reserved is not 0
 *		    -EINVAL       - netns_id is less than -1
 *		    -EINVAL       - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12
 *		    -EINVAL       - opts->ct_zone_id set when
 *				    opts__sz isn't NF_BPF_CT_OPTS_SZ (16)
 *		    -EPROTO       - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP
 *		    -ENONET       - No network namespace found for netns_id
 *		    -ENOENT       - Conntrack lookup could not find entry for tuple
 *		    -EAFNOSUPPORT - tuple__sz isn't one of sizeof(tuple->ipv4)
 *				    or sizeof(tuple->ipv6)
 * @l4proto     - Layer 4 protocol
 *		  Values:
 *		    IPPROTO_TCP, IPPROTO_UDP
 * @dir         - Connection tracking tuple direction.
 * @ct_zone_id  - Connection tracking zone id.
 * @ct_zone_dir - Connection tracking zone direction.
 * @reserved    - Reserved member, will be reused for more options in future
 *		  Values:
 *		    0
 */
struct bpf_ct_opts {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u16 ct_zone_id;
	u8 ct_zone_dir;
	u8 reserved[3];
};

enum {
	NF_BPF_CT_OPTS_SZ = 16,
};
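
/* Example: filling bpf_ct_opts on the BPF side. A minimal sketch, not part
 * of this file; it assumes the program sees a matching definition of this
 * struct (e.g. via vmlinux.h) and passes sizeof(opts) as opts__sz:
 *
 *	struct bpf_ct_opts opts = {
 *		.netns_id = BPF_F_CURRENT_NETNS,	// namespace of the ctx
 *		.l4proto  = IPPROTO_TCP,		// or IPPROTO_UDP
 *	};
 *
 * Leaving ct_zone_id and ct_zone_dir zeroed selects the default conntrack
 * zone; any non-zero byte in reserved[] makes the kfuncs below fail with
 * -EINVAL.
 */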

static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple,
				 u32 tuple_len, u8 protonum, u8 dir,
				 struct nf_conntrack_tuple *tuple)
{
	union nf_inet_addr *src = dir ? &tuple->dst.u3 : &tuple->src.u3;
	union nf_inet_addr *dst = dir ? &tuple->src.u3 : &tuple->dst.u3;
	union nf_conntrack_man_proto *sport = dir ? (void *)&tuple->dst.u
						  : &tuple->src.u;
	union nf_conntrack_man_proto *dport = dir ? &tuple->src.u
						  : (void *)&tuple->dst.u;

	if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP))
		return -EPROTO;

	memset(tuple, 0, sizeof(*tuple));

	switch (tuple_len) {
	case sizeof(bpf_tuple->ipv4):
		tuple->src.l3num = AF_INET;
		src->ip = bpf_tuple->ipv4.saddr;
		sport->tcp.port = bpf_tuple->ipv4.sport;
		dst->ip = bpf_tuple->ipv4.daddr;
		dport->tcp.port = bpf_tuple->ipv4.dport;
		break;
	case sizeof(bpf_tuple->ipv6):
		tuple->src.l3num = AF_INET6;
		memcpy(src->ip6, bpf_tuple->ipv6.saddr, sizeof(bpf_tuple->ipv6.saddr));
		sport->tcp.port = bpf_tuple->ipv6.sport;
		memcpy(dst->ip6, bpf_tuple->ipv6.daddr, sizeof(bpf_tuple->ipv6.daddr));
		dport->tcp.port = bpf_tuple->ipv6.dport;
		break;
	default:
		return -EAFNOSUPPORT;
	}
	tuple->dst.protonum = protonum;
	tuple->dst.dir = dir;

	return 0;
}
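
/* Example: bpf_nf_ct_tuple_parse() above keeps the bpf_sock_tuple fixed and
 * encodes the direction in the resulting nf_conntrack_tuple instead. For an
 * IPv4 tuple of 10.0.0.1:1234 -> 10.0.0.2:80 (illustrative addresses):
 *
 *	dir == IP_CT_DIR_ORIGINAL: src = 10.0.0.1:1234, dst = 10.0.0.2:80
 *	dir == IP_CT_DIR_REPLY:    src = 10.0.0.2:80,   dst = 10.0.0.1:1234
 *
 * which is how __bpf_nf_ct_alloc_entry() below derives both the original and
 * the reply tuple from a single bpf_sock_tuple.
 */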

static struct nf_conn *
__bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
			u32 tuple_len, struct bpf_ct_opts *opts, u32 opts_len,
			u32 timeout)
{
	struct nf_conntrack_tuple otuple, rtuple;
	struct nf_conntrack_zone ct_zone;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple)
		return ERR_PTR(-EINVAL);
	if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
		return ERR_PTR(-EINVAL);
	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
			return ERR_PTR(-EINVAL);
	} else {
		if (opts->ct_zone_id)
			return ERR_PTR(-EINVAL);
	}

	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &otuple);
	if (err < 0)
		return ERR_PTR(err);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_REPLY, &rtuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->ct_zone_dir == 0)
			opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
		nf_ct_zone_init(&ct_zone,
				opts->ct_zone_id, opts->ct_zone_dir, 0);
	} else {
		ct_zone = nf_ct_zone_dflt;
	}

	ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple,
				GFP_ATOMIC);
	if (IS_ERR(ct))
		goto out;

	memset(&ct->proto, 0, sizeof(ct->proto));
	__nf_ct_set_timeout(ct, timeout * HZ);

out:
	if (opts->netns_id >= 0)
		put_net(net);

	return ct;
}

static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
					  struct bpf_sock_tuple *bpf_tuple,
					  u32 tuple_len, struct bpf_ct_opts *opts,
					  u32 opts_len)
{
	struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_zone ct_zone;
	struct nf_conn *ct;
	int err;

	if (!opts || !bpf_tuple)
		return ERR_PTR(-EINVAL);
	if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12))
		return ERR_PTR(-EINVAL);
	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2])
			return ERR_PTR(-EINVAL);
	} else {
		if (opts->ct_zone_id)
			return ERR_PTR(-EINVAL);
	}
	if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP))
		return ERR_PTR(-EPROTO);
	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS))
		return ERR_PTR(-EINVAL);

	err = bpf_nf_ct_tuple_parse(bpf_tuple, tuple_len, opts->l4proto,
				    IP_CT_DIR_ORIGINAL, &tuple);
	if (err < 0)
		return ERR_PTR(err);

	if (opts->netns_id >= 0) {
		net = get_net_ns_by_id(net, opts->netns_id);
		if (unlikely(!net))
			return ERR_PTR(-ENONET);
	}

	if (opts_len == NF_BPF_CT_OPTS_SZ) {
		if (opts->ct_zone_dir == 0)
			opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR;
		nf_ct_zone_init(&ct_zone,
				opts->ct_zone_id, opts->ct_zone_dir, 0);
	} else {
		ct_zone = nf_ct_zone_dflt;
	}

	hash = nf_conntrack_find_get(net, &ct_zone, &tuple);
	if (opts->netns_id >= 0)
		put_net(net);
	if (!hash)
		return ERR_PTR(-ENOENT);

	ct = nf_ct_tuplehash_to_ctrack(hash);
	opts->dir = NF_CT_DIRECTION(hash);

	return ct;
}

BTF_ID_LIST(btf_nf_conn_ids)
BTF_ID(struct, nf_conn)
BTF_ID(struct, nf_conn___init)

/* Check writes into `struct nf_conn` */
static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
					   const struct bpf_reg_state *reg,
					   int off, int size)
{
	const struct btf_type *ncit, *nct, *t;
	size_t end;

	ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]);
	nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]);
	t = btf_type_by_id(reg->btf, reg->btf_id);
	if (t != nct && t != ncit) {
		bpf_log(log, "only read is supported\n");
		return -EACCES;
	}

	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
	 * so we are safe to simply merge offset checks here
	 */
	switch (off) {
#if defined(CONFIG_NF_CONNTRACK_MARK)
	case offsetof(struct nf_conn, mark):
		end = offsetofend(struct nf_conn, mark);
		break;
#endif
	default:
		bpf_log(log, "no write support to nf_conn at off %d\n", off);
		return -EACCES;
	}

	if (off + size > end) {
		bpf_log(log,
			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
			off, size, end);
		return -EACCES;
	}

	return 0;
}

__bpf_kfunc_start_defs();

/* bpf_xdp_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
 *		  Cannot be NULL
 * @bpf_tuple	- Pointer to memory representing the tuple to look up
 *		  Cannot be NULL
 * @tuple__sz	- Length of the tuple structure
 *		  Must be one of sizeof(bpf_tuple->ipv4) or
 *		  sizeof(bpf_tuple->ipv6)
 * @opts	- Additional options for allocation (documented above)
 *		  Cannot be NULL
 * @opts__sz	- Length of the bpf_ct_opts structure
 *		  Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct nf_conn *nfct;

	nfct = __bpf_nf_ct_alloc_entry(dev_net(ctx->rxq->dev), bpf_tuple, tuple__sz,
				       opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}
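
/* Example: allocating and inserting an entry from an XDP program. A minimal
 * BPF-side sketch, not part of this file; it assumes vmlinux.h,
 * bpf/bpf_helpers.h and bpf/bpf_endian.h plus __ksym extern declarations for
 * the kfuncs, with illustrative addresses and ports:
 *
 *	SEC("xdp")
 *	int ct_xdp(struct xdp_md *ctx)
 *	{
 *		struct bpf_ct_opts opts = { .netns_id = -1, .l4proto = IPPROTO_TCP };
 *		struct bpf_sock_tuple tup = {};
 *		struct nf_conn___init *ct;
 *		struct nf_conn *ins;
 *
 *		tup.ipv4.saddr = bpf_htonl(0x0a000001);	// 10.0.0.1
 *		tup.ipv4.daddr = bpf_htonl(0x0a000002);	// 10.0.0.2
 *		tup.ipv4.sport = bpf_htons(1234);
 *		tup.ipv4.dport = bpf_htons(80);
 *
 *		ct = bpf_xdp_ct_alloc(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
 *		if (!ct)
 *			return XDP_PASS;	// opts.error holds the errno
 *		bpf_ct_set_timeout(ct, 30000);	// 30s, before insertion
 *		ins = bpf_ct_insert_entry(ct);	// consumes the alloc reference
 *		if (ins)
 *			bpf_ct_release(ins);	// drop the acquired reference
 *		return XDP_PASS;
 *	}
 */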

/* bpf_xdp_ct_lookup - Lookup CT entry for the given tuple, and acquire a
 *		       reference to it
 *
 * Parameters:
 * @xdp_ctx	- Pointer to ctx (xdp_md) in XDP program
 *		  Cannot be NULL
 * @bpf_tuple	- Pointer to memory representing the tuple to look up
 *		  Cannot be NULL
 * @tuple__sz	- Length of the tuple structure
 *		  Must be one of sizeof(bpf_tuple->ipv4) or
 *		  sizeof(bpf_tuple->ipv6)
 * @opts	- Additional options for lookup (documented above)
 *		  Cannot be NULL
 * @opts__sz	- Length of the bpf_ct_opts structure
 *		  Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct xdp_buff *ctx = (struct xdp_buff *)xdp_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = dev_net(ctx->rxq->dev);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}

/* bpf_skb_ct_alloc - Allocate a new CT entry
 *
 * Parameters:
 * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
 *		  Cannot be NULL
 * @bpf_tuple	- Pointer to memory representing the tuple to look up
 *		  Cannot be NULL
 * @tuple__sz	- Length of the tuple structure
 *		  Must be one of sizeof(bpf_tuple->ipv4) or
 *		  sizeof(bpf_tuple->ipv6)
 * @opts	- Additional options for allocation (documented above)
 *		  Cannot be NULL
 * @opts__sz	- Length of the bpf_ct_opts structure
 *		  Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn___init *
bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		 u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct nf_conn *nfct;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_alloc_entry(net, bpf_tuple, tuple__sz, opts, opts__sz, 10);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}

	return (struct nf_conn___init *)nfct;
}

/* bpf_skb_ct_lookup - Lookup CT entry for the given tuple, and acquire a
 *		       reference to it
 *
 * Parameters:
 * @skb_ctx	- Pointer to ctx (__sk_buff) in TC program
 *		  Cannot be NULL
 * @bpf_tuple	- Pointer to memory representing the tuple to look up
 *		  Cannot be NULL
 * @tuple__sz	- Length of the tuple structure
 *		  Must be one of sizeof(bpf_tuple->ipv4) or
 *		  sizeof(bpf_tuple->ipv6)
 * @opts	- Additional options for lookup (documented above)
 *		  Cannot be NULL
 * @opts__sz	- Length of the bpf_ct_opts structure
 *		  Must be NF_BPF_CT_OPTS_SZ (16) or 12
 */
__bpf_kfunc struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
		  u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
	struct net *caller_net;
	struct nf_conn *nfct;

	caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
	nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts, opts__sz);
	if (IS_ERR(nfct)) {
		if (opts)
			opts->error = PTR_ERR(nfct);
		return NULL;
	}
	return nfct;
}
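
/* Example: looking up an entry from a TC (SCHED_CLS) program. A minimal
 * BPF-side sketch, not part of this file, under the same assumptions as the
 * XDP example above; a found entry must be released before the program
 * exits:
 *
 *	SEC("tc")
 *	int ct_tc(struct __sk_buff *skb)
 *	{
 *		struct bpf_ct_opts opts = { .netns_id = -1, .l4proto = IPPROTO_UDP };
 *		struct bpf_sock_tuple tup = {};
 *		struct nf_conn *ct;
 *
 *		tup.ipv4.saddr = bpf_htonl(0x0a000001);	// 10.0.0.1, illustrative
 *		tup.ipv4.daddr = bpf_htonl(0x0a000002);	// 10.0.0.2
 *		tup.ipv4.sport = bpf_htons(5353);
 *		tup.ipv4.dport = bpf_htons(53);
 *
 *		ct = bpf_skb_ct_lookup(skb, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
 *		if (!ct)
 *			return TC_ACT_OK;	// opts.error holds the errno
 *		// opts.dir now holds the tuple direction; read-only fields
 *		// such as ct->status may be inspected here.
 *		bpf_ct_release(ct);
 *		return TC_ACT_OK;
 *	}
 */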

/* bpf_ct_insert_entry - Add the provided entry into a CT map
 *
 * This must be invoked for referenced PTR_TO_BTF_ID.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn___init object, obtained
 *		  using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 */
__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
{
	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
	int err;

	if (!nf_ct_is_confirmed(nfct))
		nfct->timeout += nfct_time_stamp;
	nfct->status |= IPS_CONFIRMED;
	err = nf_conntrack_hash_check_insert(nfct);
	if (err < 0) {
		nf_conntrack_free(nfct);
		return NULL;
	}
	return nfct;
}

/* bpf_ct_release - Release acquired nf_conn object
 *
 * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
 * the program if any references remain in the program in all of the explored
 * states.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn object, obtained using
 *		  bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 */
__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
	nf_ct_put(nfct);
}

/* bpf_ct_set_timeout - Set timeout of allocated nf_conn
 *
 * Sets the default timeout of a newly allocated nf_conn before insertion.
 * This helper must be invoked for a refcounted pointer to nf_conn___init.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn object, obtained using
 *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @timeout	- Timeout in msecs.
 */
__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
{
	__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
}

/* bpf_ct_change_timeout - Change timeout of inserted nf_conn
 *
 * Change the timeout associated with the inserted or looked-up nf_conn.
 * This helper must be invoked for a refcounted pointer to nf_conn.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn object, obtained using
 *		  bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
 * @timeout	- New timeout in msecs.
 */
__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
{
	return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
}

/* bpf_ct_set_status - Set status field of allocated nf_conn
 *
 * Set the status field of the newly allocated nf_conn before insertion.
 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn___init.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn object, obtained using
 *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
 * @status	- New status value.
 */
__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
{
	return nf_ct_change_status_common((struct nf_conn *)nfct, status);
}
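
/* Example: the bpf_ct_set_*() and bpf_ct_change_*() kfuncs differ only in
 * lifecycle stage. A short BPF-side sketch with hypothetical values: on a
 * fresh nf_conn___init from one of the alloc kfuncs, use the pre-insertion
 * setters,
 *
 *	bpf_ct_set_timeout(ct, 10000);		// 10s default timeout
 *	bpf_ct_set_status(ct, IPS_SEEN_REPLY);
 *
 * while an inserted or looked-up nf_conn takes the change variants,
 *
 *	bpf_ct_change_timeout(ct_ins, 10000);
 *	bpf_ct_change_status(ct_ins, IPS_SEEN_REPLY | IPS_ASSURED);
 *
 * Note that nf_ct_change_status_common() rejects flipping unchangeable bits
 * such as IPS_CONFIRMED or IPS_DYING.
 */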

/* bpf_ct_change_status - Change status of inserted nf_conn
 *
 * Change the status field of the provided connection tracking entry.
 * This must be invoked for referenced PTR_TO_BTF_ID to nf_conn.
 *
 * Parameters:
 * @nfct	- Pointer to referenced nf_conn object, obtained using
 *		  bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
 * @status	- New status value.
 */
__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
	return nf_ct_change_status_common(nfct, status);
}

__bpf_kfunc_end_defs();

BTF_KFUNCS_START(nf_ct_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_xdp_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_alloc, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_skb_ct_lookup, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_ct_insert_entry, KF_ACQUIRE | KF_RET_NULL | KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(nf_ct_kfunc_set)

static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
	.owner = THIS_MODULE,
	.set   = &nf_ct_kfunc_set,
};

int register_nf_conntrack_bpf(void)
{
	int ret;

	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
	if (!ret) {
		mutex_lock(&nf_conn_btf_access_lock);
		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
		mutex_unlock(&nf_conn_btf_access_lock);
	}

	return ret;
}

void cleanup_nf_conntrack_bpf(void)
{
	mutex_lock(&nf_conn_btf_access_lock);
	nfct_btf_struct_access = NULL;
	mutex_unlock(&nf_conn_btf_access_lock);
}