1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Linux Socket Filter - Kernel level socket filtering 4 * 5 * Based on the design of the Berkeley Packet Filter. The new 6 * internal format has been designed by PLUMgrid: 7 * 8 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com 9 * 10 * Authors: 11 * 12 * Jay Schulist <jschlst@samba.org> 13 * Alexei Starovoitov <ast@plumgrid.com> 14 * Daniel Borkmann <dborkman@redhat.com> 15 * 16 * Andi Kleen - Fix a few bad bugs and races. 17 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 18 */ 19 20 #include <linux/atomic.h> 21 #include <linux/bpf_verifier.h> 22 #include <linux/module.h> 23 #include <linux/types.h> 24 #include <linux/mm.h> 25 #include <linux/fcntl.h> 26 #include <linux/socket.h> 27 #include <linux/sock_diag.h> 28 #include <linux/in.h> 29 #include <linux/inet.h> 30 #include <linux/netdevice.h> 31 #include <linux/if_packet.h> 32 #include <linux/if_arp.h> 33 #include <linux/gfp.h> 34 #include <net/inet_common.h> 35 #include <net/ip.h> 36 #include <net/protocol.h> 37 #include <net/netlink.h> 38 #include <linux/skbuff.h> 39 #include <linux/skmsg.h> 40 #include <net/sock.h> 41 #include <net/flow_dissector.h> 42 #include <linux/errno.h> 43 #include <linux/timer.h> 44 #include <linux/uaccess.h> 45 #include <linux/unaligned.h> 46 #include <linux/filter.h> 47 #include <linux/ratelimit.h> 48 #include <linux/seccomp.h> 49 #include <linux/if_vlan.h> 50 #include <linux/bpf.h> 51 #include <linux/btf.h> 52 #include <net/sch_generic.h> 53 #include <net/cls_cgroup.h> 54 #include <net/dst_metadata.h> 55 #include <net/dst.h> 56 #include <net/sock_reuseport.h> 57 #include <net/busy_poll.h> 58 #include <net/tcp.h> 59 #include <net/xfrm.h> 60 #include <net/udp.h> 61 #include <linux/bpf_trace.h> 62 #include <net/xdp_sock.h> 63 #include <linux/inetdevice.h> 64 #include <net/inet_hashtables.h> 65 #include <net/inet6_hashtables.h> 66 #include <net/ip_fib.h> 67 #include <net/nexthop.h> 68 #include 
<net/flow.h> 69 #include <net/arp.h> 70 #include <net/ipv6.h> 71 #include <net/net_namespace.h> 72 #include <linux/seg6_local.h> 73 #include <net/seg6.h> 74 #include <net/seg6_local.h> 75 #include <net/lwtunnel.h> 76 #include <net/ipv6_stubs.h> 77 #include <net/bpf_sk_storage.h> 78 #include <net/transp_v6.h> 79 #include <linux/btf_ids.h> 80 #include <net/tls.h> 81 #include <net/xdp.h> 82 #include <net/mptcp.h> 83 #include <net/netfilter/nf_conntrack_bpf.h> 84 #include <net/netkit.h> 85 #include <linux/un.h> 86 #include <net/xdp_sock_drv.h> 87 #include <net/inet_dscp.h> 88 89 #include "dev.h" 90 91 /* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */ 92 static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check"); 93 94 static const struct bpf_func_proto * 95 bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); 96 97 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) 98 { 99 if (in_compat_syscall()) { 100 struct compat_sock_fprog f32; 101 102 if (len != sizeof(f32)) 103 return -EINVAL; 104 if (copy_from_sockptr(&f32, src, sizeof(f32))) 105 return -EFAULT; 106 memset(dst, 0, sizeof(*dst)); 107 dst->len = f32.len; 108 dst->filter = compat_ptr(f32.filter); 109 } else { 110 if (len != sizeof(*dst)) 111 return -EINVAL; 112 if (copy_from_sockptr(dst, src, sizeof(*dst))) 113 return -EFAULT; 114 } 115 116 return 0; 117 } 118 EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); 119 120 /** 121 * sk_filter_trim_cap - run a packet through a socket filter 122 * @sk: sock associated with &sk_buff 123 * @skb: buffer to filter 124 * @cap: limit on how short the eBPF program may trim the packet 125 * @reason: record drop reason on errors (negative return value) 126 * 127 * Run the eBPF program and then cut skb->data to correct size returned by 128 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller 129 * than pkt_len we keep whole skb->data. 
This is the socket level 130 * wrapper to bpf_prog_run. It returns 0 if the packet should 131 * be accepted or -EPERM if the packet should be tossed. 132 * 133 */ 134 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, 135 unsigned int cap, enum skb_drop_reason *reason) 136 { 137 int err; 138 struct sk_filter *filter; 139 140 /* 141 * If the skb was allocated from pfmemalloc reserves, only 142 * allow SOCK_MEMALLOC sockets to use it as this socket is 143 * helping free memory 144 */ 145 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { 146 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 147 *reason = SKB_DROP_REASON_PFMEMALLOC; 148 return -ENOMEM; 149 } 150 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); 151 if (err) { 152 *reason = SKB_DROP_REASON_SOCKET_FILTER; 153 return err; 154 } 155 156 err = security_sock_rcv_skb(sk, skb); 157 if (err) { 158 *reason = SKB_DROP_REASON_SECURITY_HOOK; 159 return err; 160 } 161 162 rcu_read_lock(); 163 filter = rcu_dereference(sk->sk_filter); 164 if (filter) { 165 struct sock *save_sk = skb->sk; 166 unsigned int pkt_len; 167 168 skb->sk = sk; 169 pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 170 skb->sk = save_sk; 171 err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; 172 if (err) 173 *reason = SKB_DROP_REASON_SOCKET_FILTER; 174 } 175 rcu_read_unlock(); 176 177 return err; 178 } 179 EXPORT_SYMBOL(sk_filter_trim_cap); 180 181 BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) 182 { 183 return skb_get_poff(skb); 184 } 185 186 BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 187 { 188 struct nlattr *nla; 189 190 if (skb_is_nonlinear(skb)) 191 return 0; 192 193 if (skb->len < sizeof(struct nlattr)) 194 return 0; 195 196 if (a > skb->len - sizeof(struct nlattr)) 197 return 0; 198 199 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); 200 if (nla) 201 return (void *) nla - (void *) skb->data; 202 203 return 0; 204 } 205 206 BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 207 { 208 struct nlattr *nla; 209 210 if (skb_is_nonlinear(skb)) 211 return 0; 212 213 if (skb->len < sizeof(struct nlattr)) 214 return 0; 215 216 if (a > skb->len - sizeof(struct nlattr)) 217 return 0; 218 219 nla = (struct nlattr *) &skb->data[a]; 220 if (!nla_ok(nla, skb->len - a)) 221 return 0; 222 223 nla = nla_find_nested(nla, x); 224 if (nla) 225 return (void *) nla - (void *) skb->data; 226 227 return 0; 228 } 229 230 static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) 231 { 232 if (likely(offset >= 0)) 233 return offset; 234 235 if (offset >= SKF_NET_OFF) 236 return offset - SKF_NET_OFF + skb_network_offset(skb); 237 238 if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) 239 return offset - SKF_LL_OFF + skb_mac_offset(skb); 240 241 return INT_MIN; 242 } 243 244 BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, 245 data, int, headlen, int, offset) 246 { 247 u8 tmp; 248 const int len = sizeof(tmp); 249 250 offset = bpf_skb_load_helper_convert_offset(skb, offset); 251 if (offset == INT_MIN) 252 return -EFAULT; 253 254 if (headlen - offset >= len) 255 return *(u8 *)(data + offset); 256 
if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 257 return tmp; 258 else 259 return -EFAULT; 260 } 261 262 BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, 263 int, offset) 264 { 265 return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, 266 offset); 267 } 268 269 BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, 270 data, int, headlen, int, offset) 271 { 272 __be16 tmp; 273 const int len = sizeof(tmp); 274 275 offset = bpf_skb_load_helper_convert_offset(skb, offset); 276 if (offset == INT_MIN) 277 return -EFAULT; 278 279 if (headlen - offset >= len) 280 return get_unaligned_be16(data + offset); 281 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 282 return be16_to_cpu(tmp); 283 else 284 return -EFAULT; 285 } 286 287 BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, 288 int, offset) 289 { 290 return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, 291 offset); 292 } 293 294 BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, 295 data, int, headlen, int, offset) 296 { 297 __be32 tmp; 298 const int len = sizeof(tmp); 299 300 offset = bpf_skb_load_helper_convert_offset(skb, offset); 301 if (offset == INT_MIN) 302 return -EFAULT; 303 304 if (headlen - offset >= len) 305 return get_unaligned_be32(data + offset); 306 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) 307 return be32_to_cpu(tmp); 308 else 309 return -EFAULT; 310 } 311 312 BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, 313 int, offset) 314 { 315 return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, 316 offset); 317 } 318 319 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, 320 struct bpf_insn *insn_buf) 321 { 322 struct bpf_insn *insn = insn_buf; 323 324 switch (skb_field) { 325 case SKF_AD_MARK: 326 BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4); 327 328 *insn++ = BPF_LDX_MEM(BPF_W, 
dst_reg, src_reg, 329 offsetof(struct sk_buff, mark)); 330 break; 331 332 case SKF_AD_PKTTYPE: 333 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET); 334 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); 335 #ifdef __BIG_ENDIAN_BITFIELD 336 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); 337 #endif 338 break; 339 340 case SKF_AD_QUEUE: 341 BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2); 342 343 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 344 offsetof(struct sk_buff, queue_mapping)); 345 break; 346 347 case SKF_AD_VLAN_TAG: 348 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2); 349 350 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ 351 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 352 offsetof(struct sk_buff, vlan_tci)); 353 break; 354 case SKF_AD_VLAN_TAG_PRESENT: 355 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); 356 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 357 offsetof(struct sk_buff, vlan_all)); 358 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); 359 *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); 360 break; 361 } 362 363 return insn - insn_buf; 364 } 365 366 static bool convert_bpf_extensions(struct sock_filter *fp, 367 struct bpf_insn **insnp) 368 { 369 struct bpf_insn *insn = *insnp; 370 u32 cnt; 371 372 switch (fp->k) { 373 case SKF_AD_OFF + SKF_AD_PROTOCOL: 374 BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2); 375 376 /* A = *(u16 *) (CTX + offsetof(protocol)) */ 377 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 378 offsetof(struct sk_buff, protocol)); 379 /* A = ntohs(A) [emitting a nop or swap16] */ 380 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 381 break; 382 383 case SKF_AD_OFF + SKF_AD_PKTTYPE: 384 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); 385 insn += cnt - 1; 386 break; 387 388 case SKF_AD_OFF + SKF_AD_IFINDEX: 389 case SKF_AD_OFF + SKF_AD_HATYPE: 390 BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); 391 
BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2); 392 393 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 394 BPF_REG_TMP, BPF_REG_CTX, 395 offsetof(struct sk_buff, dev)); 396 /* if (tmp != 0) goto pc + 1 */ 397 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); 398 *insn++ = BPF_EXIT_INSN(); 399 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) 400 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 401 offsetof(struct net_device, ifindex)); 402 else 403 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 404 offsetof(struct net_device, type)); 405 break; 406 407 case SKF_AD_OFF + SKF_AD_MARK: 408 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); 409 insn += cnt - 1; 410 break; 411 412 case SKF_AD_OFF + SKF_AD_RXHASH: 413 BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4); 414 415 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, 416 offsetof(struct sk_buff, hash)); 417 break; 418 419 case SKF_AD_OFF + SKF_AD_QUEUE: 420 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); 421 insn += cnt - 1; 422 break; 423 424 case SKF_AD_OFF + SKF_AD_VLAN_TAG: 425 cnt = convert_skb_access(SKF_AD_VLAN_TAG, 426 BPF_REG_A, BPF_REG_CTX, insn); 427 insn += cnt - 1; 428 break; 429 430 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: 431 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 432 BPF_REG_A, BPF_REG_CTX, insn); 433 insn += cnt - 1; 434 break; 435 436 case SKF_AD_OFF + SKF_AD_VLAN_TPID: 437 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2); 438 439 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ 440 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 441 offsetof(struct sk_buff, vlan_proto)); 442 /* A = ntohs(A) [emitting a nop or swap16] */ 443 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 444 break; 445 446 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 447 case SKF_AD_OFF + SKF_AD_NLATTR: 448 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 449 case SKF_AD_OFF + SKF_AD_CPU: 450 case SKF_AD_OFF + SKF_AD_RANDOM: 451 /* arg1 = CTX */ 452 
*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 453 /* arg2 = A */ 454 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); 455 /* arg3 = X */ 456 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); 457 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 458 switch (fp->k) { 459 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 460 *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); 461 break; 462 case SKF_AD_OFF + SKF_AD_NLATTR: 463 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); 464 break; 465 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 466 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); 467 break; 468 case SKF_AD_OFF + SKF_AD_CPU: 469 *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); 470 break; 471 case SKF_AD_OFF + SKF_AD_RANDOM: 472 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 473 bpf_user_rnd_init_once(); 474 break; 475 } 476 break; 477 478 case SKF_AD_OFF + SKF_AD_ALU_XOR_X: 479 /* A ^= X */ 480 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); 481 break; 482 483 default: 484 /* This is just a dummy call to avoid letting the compiler 485 * evict __bpf_call_base() as an optimization. Placed here 486 * where no-one bothers. 
487 */ 488 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); 489 return false; 490 } 491 492 *insnp = insn; 493 return true; 494 } 495 496 static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) 497 { 498 const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); 499 int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); 500 bool endian = BPF_SIZE(fp->code) == BPF_H || 501 BPF_SIZE(fp->code) == BPF_W; 502 bool indirect = BPF_MODE(fp->code) == BPF_IND; 503 const int ip_align = NET_IP_ALIGN; 504 struct bpf_insn *insn = *insnp; 505 int offset = fp->k; 506 507 if (!indirect && 508 ((unaligned_ok && offset >= 0) || 509 (!unaligned_ok && offset >= 0 && 510 offset + ip_align >= 0 && 511 offset + ip_align % size == 0))) { 512 bool ldx_off_ok = offset <= S16_MAX; 513 514 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); 515 if (offset) 516 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); 517 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, 518 size, 2 + endian + (!ldx_off_ok * 2)); 519 if (ldx_off_ok) { 520 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, 521 BPF_REG_D, offset); 522 } else { 523 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); 524 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); 525 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, 526 BPF_REG_TMP, 0); 527 } 528 if (endian) 529 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); 530 *insn++ = BPF_JMP_A(8); 531 } 532 533 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 534 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); 535 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); 536 if (!indirect) { 537 *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); 538 } else { 539 *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); 540 if (fp->k) 541 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); 542 } 543 544 switch (BPF_SIZE(fp->code)) { 545 case BPF_B: 546 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); 547 break; 548 case BPF_H: 549 *insn++ = 
BPF_EMIT_CALL(bpf_skb_load_helper_16); 550 break; 551 case BPF_W: 552 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); 553 break; 554 default: 555 return false; 556 } 557 558 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); 559 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 560 *insn = BPF_EXIT_INSN(); 561 562 *insnp = insn; 563 return true; 564 } 565 566 /** 567 * bpf_convert_filter - convert filter program 568 * @prog: the user passed filter program 569 * @len: the length of the user passed filter program 570 * @new_prog: allocated 'struct bpf_prog' or NULL 571 * @new_len: pointer to store length of converted program 572 * @seen_ld_abs: bool whether we've seen ld_abs/ind 573 * 574 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' 575 * style extended BPF (eBPF). 576 * Conversion workflow: 577 * 578 * 1) First pass for calculating the new program length: 579 * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) 580 * 581 * 2) 2nd pass to remap in two passes: 1st pass finds new 582 * jump offsets, 2nd pass remapping: 583 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) 584 */ 585 static int bpf_convert_filter(struct sock_filter *prog, int len, 586 struct bpf_prog *new_prog, int *new_len, 587 bool *seen_ld_abs) 588 { 589 int new_flen = 0, pass = 0, target, i, stack_off; 590 struct bpf_insn *new_insn, *first_insn = NULL; 591 struct sock_filter *fp; 592 int *addrs = NULL; 593 u8 bpf_src; 594 595 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); 596 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 597 598 if (len <= 0 || len > BPF_MAXINSNS) 599 return -EINVAL; 600 601 if (new_prog) { 602 first_insn = new_prog->insnsi; 603 addrs = kzalloc_objs(*addrs, len, GFP_KERNEL | __GFP_NOWARN); 604 if (!addrs) 605 return -ENOMEM; 606 } 607 608 do_pass: 609 new_insn = first_insn; 610 fp = prog; 611 612 /* Classic BPF related prologue emission. 
*/ 613 if (new_prog) { 614 /* Classic BPF expects A and X to be reset first. These need 615 * to be guaranteed to be the first two instructions. 616 */ 617 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 618 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 619 620 /* All programs must keep CTX in callee saved BPF_REG_CTX. 621 * In eBPF case it's done by the compiler, here we need to 622 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 623 */ 624 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 625 if (*seen_ld_abs) { 626 /* For packet access in classic BPF, cache skb->data 627 * in callee-saved BPF R8 and skb->len - skb->data_len 628 * (headlen) in BPF R9. Since classic BPF is read-only 629 * on CTX, we only need to cache it once. 630 */ 631 *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 632 BPF_REG_D, BPF_REG_CTX, 633 offsetof(struct sk_buff, data)); 634 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, 635 offsetof(struct sk_buff, len)); 636 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, 637 offsetof(struct sk_buff, data_len)); 638 *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); 639 } 640 } else { 641 new_insn += 3; 642 } 643 644 for (i = 0; i < len; fp++, i++) { 645 struct bpf_insn tmp_insns[32] = { }; 646 struct bpf_insn *insn = tmp_insns; 647 648 if (addrs) 649 addrs[i] = new_insn - first_insn; 650 651 switch (fp->code) { 652 /* All arithmetic insns and skb loads map as-is. 
*/ 653 case BPF_ALU | BPF_ADD | BPF_X: 654 case BPF_ALU | BPF_ADD | BPF_K: 655 case BPF_ALU | BPF_SUB | BPF_X: 656 case BPF_ALU | BPF_SUB | BPF_K: 657 case BPF_ALU | BPF_AND | BPF_X: 658 case BPF_ALU | BPF_AND | BPF_K: 659 case BPF_ALU | BPF_OR | BPF_X: 660 case BPF_ALU | BPF_OR | BPF_K: 661 case BPF_ALU | BPF_LSH | BPF_X: 662 case BPF_ALU | BPF_LSH | BPF_K: 663 case BPF_ALU | BPF_RSH | BPF_X: 664 case BPF_ALU | BPF_RSH | BPF_K: 665 case BPF_ALU | BPF_XOR | BPF_X: 666 case BPF_ALU | BPF_XOR | BPF_K: 667 case BPF_ALU | BPF_MUL | BPF_X: 668 case BPF_ALU | BPF_MUL | BPF_K: 669 case BPF_ALU | BPF_DIV | BPF_X: 670 case BPF_ALU | BPF_DIV | BPF_K: 671 case BPF_ALU | BPF_MOD | BPF_X: 672 case BPF_ALU | BPF_MOD | BPF_K: 673 case BPF_ALU | BPF_NEG: 674 case BPF_LD | BPF_ABS | BPF_W: 675 case BPF_LD | BPF_ABS | BPF_H: 676 case BPF_LD | BPF_ABS | BPF_B: 677 case BPF_LD | BPF_IND | BPF_W: 678 case BPF_LD | BPF_IND | BPF_H: 679 case BPF_LD | BPF_IND | BPF_B: 680 /* Check for overloaded BPF extension and 681 * directly convert it if found, otherwise 682 * just move on with mapping. 683 */ 684 if (BPF_CLASS(fp->code) == BPF_LD && 685 BPF_MODE(fp->code) == BPF_ABS && 686 convert_bpf_extensions(fp, &insn)) 687 break; 688 if (BPF_CLASS(fp->code) == BPF_LD && 689 convert_bpf_ld_abs(fp, &insn)) { 690 *seen_ld_abs = true; 691 break; 692 } 693 694 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || 695 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { 696 *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); 697 /* Error with exception code on div/mod by 0. 698 * For cBPF programs, this was always return 0. 699 */ 700 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); 701 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 702 *insn++ = BPF_EXIT_INSN(); 703 } 704 705 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 706 break; 707 708 /* Jump transformation cannot use BPF block macros 709 * everywhere as offset calculation and target updates 710 * require a bit more work than the rest, i.e. 
jump 711 * opcodes map as-is, but offsets need adjustment. 712 */ 713 714 #define BPF_EMIT_JMP \ 715 do { \ 716 const s32 off_min = S16_MIN, off_max = S16_MAX; \ 717 s32 off; \ 718 \ 719 if (target >= len || target < 0) \ 720 goto err; \ 721 off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ 722 /* Adjust pc relative offset for 2nd or 3rd insn. */ \ 723 off -= insn - tmp_insns; \ 724 /* Reject anything not fitting into insn->off. */ \ 725 if (off < off_min || off > off_max) \ 726 goto err; \ 727 insn->off = off; \ 728 } while (0) 729 730 case BPF_JMP | BPF_JA: 731 target = i + fp->k + 1; 732 insn->code = fp->code; 733 BPF_EMIT_JMP; 734 break; 735 736 case BPF_JMP | BPF_JEQ | BPF_K: 737 case BPF_JMP | BPF_JEQ | BPF_X: 738 case BPF_JMP | BPF_JSET | BPF_K: 739 case BPF_JMP | BPF_JSET | BPF_X: 740 case BPF_JMP | BPF_JGT | BPF_K: 741 case BPF_JMP | BPF_JGT | BPF_X: 742 case BPF_JMP | BPF_JGE | BPF_K: 743 case BPF_JMP | BPF_JGE | BPF_X: 744 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { 745 /* BPF immediates are signed, zero extend 746 * immediate into tmp register and use it 747 * in compare insn. 748 */ 749 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); 750 751 insn->dst_reg = BPF_REG_A; 752 insn->src_reg = BPF_REG_TMP; 753 bpf_src = BPF_X; 754 } else { 755 insn->dst_reg = BPF_REG_A; 756 insn->imm = fp->k; 757 bpf_src = BPF_SRC(fp->code); 758 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; 759 } 760 761 /* Common case where 'jump_false' is next insn. */ 762 if (fp->jf == 0) { 763 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 764 target = i + fp->jt + 1; 765 BPF_EMIT_JMP; 766 break; 767 } 768 769 /* Convert some jumps when 'jump_true' is next insn. 
*/ 770 if (fp->jt == 0) { 771 switch (BPF_OP(fp->code)) { 772 case BPF_JEQ: 773 insn->code = BPF_JMP | BPF_JNE | bpf_src; 774 break; 775 case BPF_JGT: 776 insn->code = BPF_JMP | BPF_JLE | bpf_src; 777 break; 778 case BPF_JGE: 779 insn->code = BPF_JMP | BPF_JLT | bpf_src; 780 break; 781 default: 782 goto jmp_rest; 783 } 784 785 target = i + fp->jf + 1; 786 BPF_EMIT_JMP; 787 break; 788 } 789 jmp_rest: 790 /* Other jumps are mapped into two insns: Jxx and JA. */ 791 target = i + fp->jt + 1; 792 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 793 BPF_EMIT_JMP; 794 insn++; 795 796 insn->code = BPF_JMP | BPF_JA; 797 target = i + fp->jf + 1; 798 BPF_EMIT_JMP; 799 break; 800 801 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ 802 case BPF_LDX | BPF_MSH | BPF_B: { 803 struct sock_filter tmp = { 804 .code = BPF_LD | BPF_ABS | BPF_B, 805 .k = fp->k, 806 }; 807 808 *seen_ld_abs = true; 809 810 /* X = A */ 811 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 812 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 813 convert_bpf_ld_abs(&tmp, &insn); 814 insn++; 815 /* A &= 0xf */ 816 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 817 /* A <<= 2 */ 818 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 819 /* tmp = X */ 820 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); 821 /* X = A */ 822 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 823 /* A = tmp */ 824 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 825 break; 826 } 827 /* RET_K is remapped into 2 insns. RET_A case doesn't need an 828 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 829 */ 830 case BPF_RET | BPF_A: 831 case BPF_RET | BPF_K: 832 if (BPF_RVAL(fp->code) == BPF_K) 833 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 834 0, fp->k); 835 *insn = BPF_EXIT_INSN(); 836 break; 837 838 /* Store to stack. */ 839 case BPF_ST: 840 case BPF_STX: 841 stack_off = fp->k * 4 + 4; 842 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == 843 BPF_ST ? 
BPF_REG_A : BPF_REG_X, 844 -stack_off); 845 /* check_load_and_stores() verifies that classic BPF can 846 * load from stack only after write, so tracking 847 * stack_depth for ST|STX insns is enough 848 */ 849 if (new_prog && new_prog->aux->stack_depth < stack_off) 850 new_prog->aux->stack_depth = stack_off; 851 break; 852 853 /* Load from stack. */ 854 case BPF_LD | BPF_MEM: 855 case BPF_LDX | BPF_MEM: 856 stack_off = fp->k * 4 + 4; 857 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 858 BPF_REG_A : BPF_REG_X, BPF_REG_FP, 859 -stack_off); 860 break; 861 862 /* A = K or X = K */ 863 case BPF_LD | BPF_IMM: 864 case BPF_LDX | BPF_IMM: 865 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 866 BPF_REG_A : BPF_REG_X, fp->k); 867 break; 868 869 /* X = A */ 870 case BPF_MISC | BPF_TAX: 871 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 872 break; 873 874 /* A = X */ 875 case BPF_MISC | BPF_TXA: 876 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); 877 break; 878 879 /* A = skb->len or X = skb->len */ 880 case BPF_LD | BPF_W | BPF_LEN: 881 case BPF_LDX | BPF_W | BPF_LEN: 882 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 883 BPF_REG_A : BPF_REG_X, BPF_REG_CTX, 884 offsetof(struct sk_buff, len)); 885 break; 886 887 /* Access seccomp_data fields. */ 888 case BPF_LDX | BPF_ABS | BPF_W: 889 /* A = *(u32 *) (ctx + K) */ 890 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); 891 break; 892 893 /* Unknown instruction. */ 894 default: 895 goto err; 896 } 897 898 insn++; 899 if (new_prog) 900 memcpy(new_insn, tmp_insns, 901 sizeof(*insn) * (insn - tmp_insns)); 902 new_insn += insn - tmp_insns; 903 } 904 905 if (!new_prog) { 906 /* Only calculating new length. */ 907 *new_len = new_insn - first_insn; 908 if (*seen_ld_abs) 909 *new_len += 4; /* Prologue bits. 
*/ 910 return 0; 911 } 912 913 pass++; 914 if (new_flen != new_insn - first_insn) { 915 new_flen = new_insn - first_insn; 916 if (pass > 2) 917 goto err; 918 goto do_pass; 919 } 920 921 kfree(addrs); 922 BUG_ON(*new_len != new_flen); 923 return 0; 924 err: 925 kfree(addrs); 926 return -EINVAL; 927 } 928 929 /* Security: 930 * 931 * As we dont want to clear mem[] array for each packet going through 932 * __bpf_prog_run(), we check that filter loaded by user never try to read 933 * a cell if not previously written, and we check all branches to be sure 934 * a malicious user doesn't try to abuse us. 935 */ 936 static int check_load_and_stores(const struct sock_filter *filter, int flen) 937 { 938 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ 939 int pc, ret = 0; 940 941 BUILD_BUG_ON(BPF_MEMWORDS > 16); 942 943 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); 944 if (!masks) 945 return -ENOMEM; 946 947 memset(masks, 0xff, flen * sizeof(*masks)); 948 949 for (pc = 0; pc < flen; pc++) { 950 memvalid &= masks[pc]; 951 952 switch (filter[pc].code) { 953 case BPF_ST: 954 case BPF_STX: 955 memvalid |= (1 << filter[pc].k); 956 break; 957 case BPF_LD | BPF_MEM: 958 case BPF_LDX | BPF_MEM: 959 if (!(memvalid & (1 << filter[pc].k))) { 960 ret = -EINVAL; 961 goto error; 962 } 963 break; 964 case BPF_JMP | BPF_JA: 965 /* A jump must set masks on target */ 966 masks[pc + 1 + filter[pc].k] &= memvalid; 967 memvalid = ~0; 968 break; 969 case BPF_JMP | BPF_JEQ | BPF_K: 970 case BPF_JMP | BPF_JEQ | BPF_X: 971 case BPF_JMP | BPF_JGE | BPF_K: 972 case BPF_JMP | BPF_JGE | BPF_X: 973 case BPF_JMP | BPF_JGT | BPF_K: 974 case BPF_JMP | BPF_JGT | BPF_X: 975 case BPF_JMP | BPF_JSET | BPF_K: 976 case BPF_JMP | BPF_JSET | BPF_X: 977 /* A jump must set masks on targets */ 978 masks[pc + 1 + filter[pc].jt] &= memvalid; 979 masks[pc + 1 + filter[pc].jf] &= memvalid; 980 memvalid = ~0; 981 break; 982 } 983 } 984 error: 985 kfree(masks); 986 return ret; 987 } 988 989 static bool 
chk_code_allowed(u16 code_to_probe)
{
	/* Table of accepted classic BPF opcodes, indexed by opcode value.
	 * Slots that are not explicitly listed are zero-initialized, i.e.
	 * false.
	 */
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	/* The table is only as long as its highest designated index, so
	 * anything beyond it must be rejected before indexing.
	 */
	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}

/* Cheap sanity check done before any allocation: the filter pointer must
 * be non-NULL and the instruction count must be in 1..BPF_MAXINSNS.
 */
static bool bpf_check_basics_ok(const struct sock_filter *filter,
				unsigned int flen)
{
	if (filter == NULL)
		return false;
	if (flen == 0 || flen > BPF_MAXINSNS)
		return false;

	return true;
}

/**
 *	bpf_check_classic - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Callers in this file run bpf_check_basics_ok() first, so @flen is
 * non-zero by the time filter[flen - 1] is inspected below.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_K:
			/* Constant shift counts of 32 or more are refused */
			if (ftest->k >= 32)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		/* Final verdict is delegated to check_load_and_stores() */
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}

/* Keep a copy of the classic instructions currently in fp->insns in a
 * freshly allocated fp->orig_prog (length taken from @fprog).
 *
 * Returns 0 on success or -ENOMEM. On the second failure path the
 * container is freed but fp->orig_prog is left pointing at it; callers in
 * this file discard @fp right away in that case.
 */
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc_obj(*fkprog);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}

/* Free the saved classic program (instructions and container), if any. */
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}

/* Drop a program according to its type: BPF_PROG_TYPE_SOCKET_FILTER goes
 * through bpf_prog_put(); every other type has its saved original filter
 * freed and is then released with bpf_prog_free().
 * NOTE(review): presumably the first case is an fd-based eBPF program and
 * the second a converted classic filter - confirm against the callers.
 */
static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}

/* Release the program held by @fp and free the sk_filter itself. */
static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 * 	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 *	sk_filter_release - release a socket filter
 *	@fp: filter to remove
 *
 *	Remove a filter from a socket and release its resources.
 *	The actual free is deferred through call_rcu() and only happens
 *	once the last reference has been dropped.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (refcount_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

/* Give back the option memory accounted for @fp on @sk and drop one
 * reference on the filter.
 */
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}

/* Take a reference on @fp and charge its size to @sk.
 *
 * Returns false if the filter's refcount had already dropped to zero or
 * if the charge does not fit; in the latter case the reference taken here
 * is dropped again.
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	if (!refcount_inc_not_zero(&fp->refcnt))
		return false;

	if (!__sk_filter_charge(sk, fp)) {
		sk_filter_release(fp);
		return false;
	}
	return true;
}

/* Convert the classic program stored in fp->insns into the eBPF
 * instruction representation, using two bpf_convert_filter() passes
 * (size calculation, then the actual remap), and select a runtime for it.
 *
 * Returns the resulting program, which may live at a different address
 * than the @fp passed in, or an ERR_PTR() after the program has been
 * released through __bpf_prog_release().
 */
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;
	bool seen_ld_abs = false;

	/* We are free to overwrite insns et al right here as it won't be used at
	 * this point in time anymore internally after the migration to the eBPF
	 * instruction representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
				 GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
				 &seen_ld_abs);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
				 &seen_ld_abs);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note,
		 * that at this time old_fp has already been released
		 * by krealloc().
		 */
		goto out_err_free;

	fp = bpf_prog_select_runtime(fp, &err);
	if (err)
		goto out_err_free;

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}

/* Turn a freshly copied classic filter into a runnable program:
 * validate it with bpf_check_classic(), run the optional @trans hook on
 * the instructions, attempt bpf_jit_compile() and, if the program is not
 * marked jited afterwards, migrate it with bpf_migrate_filter().
 *
 * Returns the prepared program or an ERR_PTR(); on every error path @fp
 * has already been released.
 */
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;
	fp->jited = 0;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, f.e. in case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the eBPF translation
	 * for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}

/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 * @pfp is only written on success.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 *
 * Returns 0 on success, -EINVAL for a malformed program, -EFAULT if the
 * instructions cannot be copied from user space, -ENOMEM on allocation
 * failure, or whatever bpf_prepare_filter() reports.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Exported counterpart of __bpf_prog_release(). */
void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/* Wrap @prog in a new sk_filter, charge it to @sk and publish it as
 * sk->sk_filter, uncharging any filter that was installed before.
 *
 * Returns 0 or -ENOMEM. On failure @prog is untouched and still owned by
 * the caller. The lockdep expression below expects the socket lock to be
 * held.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc_obj(*fp);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;

	if (!__sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}
	refcount_set(&fp->refcnt, 1);

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

/* Build a prepared program from a user supplied classic filter, keeping
 * a copy of the original instructions in prog->orig_prog.
 *
 * Returns the program or an ERR_PTR(): -EPERM if the socket has
 * SOCK_FILTER_LOCKED set, -EINVAL, -ENOMEM, -EFAULT, or the error from
 * bpf_prepare_filter().
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}

/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		/* Attach failed, so the program is still ours to release */
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/* Build a program from a user supplied classic filter and hand it to
 * reuseport_attach_prog(). Programs larger than sysctl_optmem_max are
 * refused with -ENOMEM. The program is released on any error.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err, optmem_max;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
	if (bpf_prog_size(prog->len) > optmem_max)
		err = -ENOMEM;
	else
		err = reuseport_attach_prog(sk, prog);

	if (err)
		__bpf_prog_release(prog);

	return err;
}

/* Look up a BPF_PROG_TYPE_SOCKET_FILTER program by fd, unless the
 * socket's filter is locked (-EPERM).
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}

/* Attach the socket filter program referred to by @ufd to @sk.
 * Returns 0 or a negative errno; the program reference obtained here is
 * put again if the attach fails.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

/* Attach the program referred to by @ufd as the socket's reuseport
 * program. The fd is first tried as BPF_PROG_TYPE_SOCKET_FILTER and, if
 * that lookup yields -EINVAL, as BPF_PROG_TYPE_SK_REUSEPORT.
 *
 * Returns 0 or a negative errno; the program reference is put on error.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog;
	int err, optmem_max;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
	if (PTR_ERR(prog) == -EINVAL)
		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
		/* Like other non BPF_PROG_TYPE_SOCKET_FILTER
		 * bpf prog (e.g. sockmap).  It depends on the
		 * limitation imposed by bpf_prog_load().
		 * Hence, sysctl_optmem_max is not checked.
		 */
		if ((sk->sk_type != SOCK_STREAM &&
		     sk->sk_type != SOCK_DGRAM) ||
		    (sk->sk_protocol != IPPROTO_UDP &&
		     sk->sk_protocol != IPPROTO_TCP) ||
		    (sk->sk_family != AF_INET &&
		     sk->sk_family != AF_INET6)) {
			err = -ENOTSUPP;
			goto err_prog_put;
		}
	} else {
		/* BPF_PROG_TYPE_SOCKET_FILTER */
		optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
		if (bpf_prog_size(prog->len) > optmem_max) {
			err = -ENOMEM;
			goto err_prog_put;
		}
	}

	err = reuseport_attach_prog(sk, prog);
err_prog_put:
	/* Also reached on success, in which case the reference is kept */
	if (err)
		bpf_prog_put(prog);

	return err;
}

/* Release a reuseport program: BPF_PROG_TYPE_SK_REUSEPORT programs are
 * put, everything else goes through bpf_prog_destroy(). NULL is a no-op.
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
	if (!prog)
		return;

	if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
		bpf_prog_put(prog);
	else
		bpf_prog_destroy(prog);
}

/* Thin wrapper around skb_ensure_writable() for the first @write_len
 * bytes of @skb; does not refresh the cached data pointers.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
					  unsigned int write_len)
{
#ifdef CONFIG_DEBUG_NET
	/* Avoid a splat in pskb_may_pull_reason() */
	if (write_len > INT_MAX)
		return -EINVAL;
#endif
	return skb_ensure_writable(skb, write_len);
}

/* Like __bpf_try_make_writable(), but always recomputes the skb's data
 * pointers afterwards, whether or not the call succeeded.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
					unsigned int write_len)
{
	int err = __bpf_try_make_writable(skb, write_len);

	bpf_compute_data_pointers(skb);
	return err;
}

static int bpf_try_make_head_writable(struct sk_buff *skb) 1694 { 1695 return bpf_try_make_writable(skb, skb_headlen(skb)); 1696 } 1697 1698 static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1699 { 1700 if (skb_at_tc_ingress(skb)) 1701 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1702 } 1703 1704 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) 1705 { 1706 if (skb_at_tc_ingress(skb)) 1707 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1708 } 1709 1710 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, 1711 const void *, from, u32, len, u64, flags) 1712 { 1713 void *ptr; 1714 1715 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1716 return -EINVAL; 1717 if (unlikely(offset > INT_MAX)) 1718 return -EFAULT; 1719 if (unlikely(bpf_try_make_writable(skb, offset + len))) 1720 return -EFAULT; 1721 1722 ptr = skb->data + offset; 1723 if (flags & BPF_F_RECOMPUTE_CSUM) 1724 __skb_postpull_rcsum(skb, ptr, len, offset); 1725 1726 memcpy(ptr, from, len); 1727 1728 if (flags & BPF_F_RECOMPUTE_CSUM) 1729 __skb_postpush_rcsum(skb, ptr, len, offset); 1730 if (flags & BPF_F_INVALIDATE_HASH) 1731 skb_clear_hash(skb); 1732 1733 return 0; 1734 } 1735 1736 static const struct bpf_func_proto bpf_skb_store_bytes_proto = { 1737 .func = bpf_skb_store_bytes, 1738 .gpl_only = false, 1739 .ret_type = RET_INTEGER, 1740 .arg1_type = ARG_PTR_TO_CTX, 1741 .arg2_type = ARG_ANYTHING, 1742 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 1743 .arg4_type = ARG_CONST_SIZE, 1744 .arg5_type = ARG_ANYTHING, 1745 }; 1746 1747 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, 1748 u32 len, u64 flags) 1749 { 1750 return ____bpf_skb_store_bytes(skb, offset, from, len, flags); 1751 } 1752 1753 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, 1754 void *, to, u32, len) 1755 { 1756 void *ptr; 1757 1758 if (unlikely(offset > INT_MAX)) 1759 goto err_clear; 1760 1761 ptr = 
skb_header_pointer(skb, offset, len, to); 1762 if (unlikely(!ptr)) 1763 goto err_clear; 1764 if (ptr != to) 1765 memcpy(to, ptr, len); 1766 1767 return 0; 1768 err_clear: 1769 memset(to, 0, len); 1770 return -EFAULT; 1771 } 1772 1773 static const struct bpf_func_proto bpf_skb_load_bytes_proto = { 1774 .func = bpf_skb_load_bytes, 1775 .gpl_only = false, 1776 .ret_type = RET_INTEGER, 1777 .arg1_type = ARG_PTR_TO_CTX, 1778 .arg2_type = ARG_ANYTHING, 1779 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 1780 .arg4_type = ARG_CONST_SIZE, 1781 }; 1782 1783 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) 1784 { 1785 return ____bpf_skb_load_bytes(skb, offset, to, len); 1786 } 1787 1788 BPF_CALL_4(bpf_flow_dissector_load_bytes, 1789 const struct bpf_flow_dissector *, ctx, u32, offset, 1790 void *, to, u32, len) 1791 { 1792 void *ptr; 1793 1794 if (unlikely(offset > 0xffff)) 1795 goto err_clear; 1796 1797 if (unlikely(!ctx->skb)) 1798 goto err_clear; 1799 1800 ptr = skb_header_pointer(ctx->skb, offset, len, to); 1801 if (unlikely(!ptr)) 1802 goto err_clear; 1803 if (ptr != to) 1804 memcpy(to, ptr, len); 1805 1806 return 0; 1807 err_clear: 1808 memset(to, 0, len); 1809 return -EFAULT; 1810 } 1811 1812 static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = { 1813 .func = bpf_flow_dissector_load_bytes, 1814 .gpl_only = false, 1815 .ret_type = RET_INTEGER, 1816 .arg1_type = ARG_PTR_TO_CTX, 1817 .arg2_type = ARG_ANYTHING, 1818 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 1819 .arg4_type = ARG_CONST_SIZE, 1820 }; 1821 1822 BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, 1823 u32, offset, void *, to, u32, len, u32, start_header) 1824 { 1825 u8 *end = skb_tail_pointer(skb); 1826 u8 *start, *ptr; 1827 1828 if (unlikely(offset > 0xffff)) 1829 goto err_clear; 1830 1831 switch (start_header) { 1832 case BPF_HDR_START_MAC: 1833 if (unlikely(!skb_mac_header_was_set(skb))) 1834 goto err_clear; 1835 start = skb_mac_header(skb); 1836 break; 
1837 case BPF_HDR_START_NET: 1838 start = skb_network_header(skb); 1839 break; 1840 default: 1841 goto err_clear; 1842 } 1843 1844 ptr = start + offset; 1845 1846 if (likely(ptr + len <= end)) { 1847 memcpy(to, ptr, len); 1848 return 0; 1849 } 1850 1851 err_clear: 1852 memset(to, 0, len); 1853 return -EFAULT; 1854 } 1855 1856 static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { 1857 .func = bpf_skb_load_bytes_relative, 1858 .gpl_only = false, 1859 .ret_type = RET_INTEGER, 1860 .arg1_type = ARG_PTR_TO_CTX, 1861 .arg2_type = ARG_ANYTHING, 1862 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 1863 .arg4_type = ARG_CONST_SIZE, 1864 .arg5_type = ARG_ANYTHING, 1865 }; 1866 1867 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1868 { 1869 /* Idea is the following: should the needed direct read/write 1870 * test fail during runtime, we can pull in more data and redo 1871 * again, since implicitly, we invalidate previous checks here. 1872 * 1873 * Or, since we know how much we need to make read/writeable, 1874 * this can be done once at the program beginning for direct 1875 * access case. By this we overcome limitations of only current 1876 * headroom being accessible. 1877 */ 1878 return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); 1879 } 1880 1881 static const struct bpf_func_proto bpf_skb_pull_data_proto = { 1882 .func = bpf_skb_pull_data, 1883 .gpl_only = false, 1884 .ret_type = RET_INTEGER, 1885 .arg1_type = ARG_PTR_TO_CTX, 1886 .arg2_type = ARG_ANYTHING, 1887 }; 1888 1889 BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) 1890 { 1891 return sk_fullsock(sk) ? 
(unsigned long)sk : (unsigned long)NULL; 1892 } 1893 1894 static const struct bpf_func_proto bpf_sk_fullsock_proto = { 1895 .func = bpf_sk_fullsock, 1896 .gpl_only = false, 1897 .ret_type = RET_PTR_TO_SOCKET_OR_NULL, 1898 .arg1_type = ARG_PTR_TO_SOCK_COMMON, 1899 }; 1900 1901 static inline int sk_skb_try_make_writable(struct sk_buff *skb, 1902 unsigned int write_len) 1903 { 1904 return __bpf_try_make_writable(skb, write_len); 1905 } 1906 1907 BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) 1908 { 1909 /* Idea is the following: should the needed direct read/write 1910 * test fail during runtime, we can pull in more data and redo 1911 * again, since implicitly, we invalidate previous checks here. 1912 * 1913 * Or, since we know how much we need to make read/writeable, 1914 * this can be done once at the program beginning for direct 1915 * access case. By this we overcome limitations of only current 1916 * headroom being accessible. 1917 */ 1918 return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); 1919 } 1920 1921 static const struct bpf_func_proto sk_skb_pull_data_proto = { 1922 .func = sk_skb_pull_data, 1923 .gpl_only = false, 1924 .ret_type = RET_INTEGER, 1925 .arg1_type = ARG_PTR_TO_CTX, 1926 .arg2_type = ARG_ANYTHING, 1927 }; 1928 1929 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, 1930 u64, from, u64, to, u64, flags) 1931 { 1932 __sum16 *ptr; 1933 1934 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1935 return -EINVAL; 1936 if (unlikely(offset > 0xffff || offset & 1)) 1937 return -EFAULT; 1938 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1939 return -EFAULT; 1940 1941 ptr = (__sum16 *)(skb->data + offset); 1942 switch (flags & BPF_F_HDR_FIELD_MASK) { 1943 case 0: 1944 if (unlikely(from != 0)) 1945 return -EINVAL; 1946 1947 csum_replace_by_diff(ptr, to); 1948 break; 1949 case 2: 1950 csum_replace2(ptr, from, to); 1951 break; 1952 case 4: 1953 csum_replace4(ptr, from, to); 1954 break; 1955 default: 1956 return -EINVAL; 1957 } 1958 1959 return 0; 1960 } 1961 1962 static const struct bpf_func_proto bpf_l3_csum_replace_proto = { 1963 .func = bpf_l3_csum_replace, 1964 .gpl_only = false, 1965 .ret_type = RET_INTEGER, 1966 .arg1_type = ARG_PTR_TO_CTX, 1967 .arg2_type = ARG_ANYTHING, 1968 .arg3_type = ARG_ANYTHING, 1969 .arg4_type = ARG_ANYTHING, 1970 .arg5_type = ARG_ANYTHING, 1971 }; 1972 1973 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, 1974 u64, from, u64, to, u64, flags) 1975 { 1976 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1977 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1978 bool do_mforce = flags & BPF_F_MARK_ENFORCE; 1979 bool is_ipv6 = flags & BPF_F_IPV6; 1980 __sum16 *ptr; 1981 1982 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | 1983 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) 1984 return -EINVAL; 1985 if (unlikely(offset > 0xffff || offset & 1)) 1986 return -EFAULT; 1987 if (unlikely(bpf_try_make_writable(skb, offset + 
sizeof(*ptr)))) 1988 return -EFAULT; 1989 1990 ptr = (__sum16 *)(skb->data + offset); 1991 if (is_mmzero && !do_mforce && !*ptr) 1992 return 0; 1993 1994 switch (flags & BPF_F_HDR_FIELD_MASK) { 1995 case 0: 1996 if (unlikely(from != 0)) 1997 return -EINVAL; 1998 1999 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); 2000 break; 2001 case 2: 2002 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); 2003 break; 2004 case 4: 2005 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); 2006 break; 2007 default: 2008 return -EINVAL; 2009 } 2010 2011 if (is_mmzero && !*ptr) 2012 *ptr = CSUM_MANGLED_0; 2013 return 0; 2014 } 2015 2016 static const struct bpf_func_proto bpf_l4_csum_replace_proto = { 2017 .func = bpf_l4_csum_replace, 2018 .gpl_only = false, 2019 .ret_type = RET_INTEGER, 2020 .arg1_type = ARG_PTR_TO_CTX, 2021 .arg2_type = ARG_ANYTHING, 2022 .arg3_type = ARG_ANYTHING, 2023 .arg4_type = ARG_ANYTHING, 2024 .arg5_type = ARG_ANYTHING, 2025 }; 2026 2027 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, 2028 __be32 *, to, u32, to_size, __wsum, seed) 2029 { 2030 /* This is quite flexible, some examples: 2031 * 2032 * from_size == 0, to_size > 0, seed := csum --> pushing data 2033 * from_size > 0, to_size == 0, seed := csum --> pulling data 2034 * from_size > 0, to_size > 0, seed := 0 --> diffing data 2035 * 2036 * Even for diffing, from_size and to_size don't need to be equal. 
2037 */ 2038 2039 __wsum ret = seed; 2040 2041 if (from_size && to_size) 2042 ret = csum_sub(csum_partial(to, to_size, ret), 2043 csum_partial(from, from_size, 0)); 2044 else if (to_size) 2045 ret = csum_partial(to, to_size, ret); 2046 2047 else if (from_size) 2048 ret = ~csum_partial(from, from_size, ~ret); 2049 2050 return csum_from32to16((__force unsigned int)ret); 2051 } 2052 2053 static const struct bpf_func_proto bpf_csum_diff_proto = { 2054 .func = bpf_csum_diff, 2055 .gpl_only = false, 2056 .pkt_access = true, 2057 .ret_type = RET_INTEGER, 2058 .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, 2059 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 2060 .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, 2061 .arg4_type = ARG_CONST_SIZE_OR_ZERO, 2062 .arg5_type = ARG_ANYTHING, 2063 }; 2064 2065 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) 2066 { 2067 /* The interface is to be used in combination with bpf_csum_diff() 2068 * for direct packet writes. csum rotation for alignment as well 2069 * as emulating csum_sub() can be done from the eBPF program. 2070 */ 2071 if (skb->ip_summed == CHECKSUM_COMPLETE) 2072 return (skb->csum = csum_add(skb->csum, csum)); 2073 2074 return -ENOTSUPP; 2075 } 2076 2077 static const struct bpf_func_proto bpf_csum_update_proto = { 2078 .func = bpf_csum_update, 2079 .gpl_only = false, 2080 .ret_type = RET_INTEGER, 2081 .arg1_type = ARG_PTR_TO_CTX, 2082 .arg2_type = ARG_ANYTHING, 2083 }; 2084 2085 BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) 2086 { 2087 /* The interface is to be used in combination with bpf_skb_adjust_room() 2088 * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET 2089 * is passed as flags, for example. 
2090 */ 2091 switch (level) { 2092 case BPF_CSUM_LEVEL_INC: 2093 __skb_incr_checksum_unnecessary(skb); 2094 break; 2095 case BPF_CSUM_LEVEL_DEC: 2096 __skb_decr_checksum_unnecessary(skb); 2097 break; 2098 case BPF_CSUM_LEVEL_RESET: 2099 __skb_reset_checksum_unnecessary(skb); 2100 break; 2101 case BPF_CSUM_LEVEL_QUERY: 2102 return skb->ip_summed == CHECKSUM_UNNECESSARY ? 2103 skb->csum_level : -EACCES; 2104 default: 2105 return -EINVAL; 2106 } 2107 2108 return 0; 2109 } 2110 2111 static const struct bpf_func_proto bpf_csum_level_proto = { 2112 .func = bpf_csum_level, 2113 .gpl_only = false, 2114 .ret_type = RET_INTEGER, 2115 .arg1_type = ARG_PTR_TO_CTX, 2116 .arg2_type = ARG_ANYTHING, 2117 }; 2118 2119 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 2120 { 2121 return dev_forward_skb_nomtu(dev, skb); 2122 } 2123 2124 static inline int __bpf_rx_skb_no_mac(struct net_device *dev, 2125 struct sk_buff *skb) 2126 { 2127 int ret = ____dev_forward_skb(dev, skb, false); 2128 2129 if (likely(!ret)) { 2130 skb->dev = dev; 2131 ret = netif_rx(skb); 2132 } 2133 2134 return ret; 2135 } 2136 2137 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) 2138 { 2139 int ret; 2140 2141 if (dev_xmit_recursion()) { 2142 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 2143 kfree_skb(skb); 2144 return -ENETDOWN; 2145 } 2146 2147 skb->dev = dev; 2148 skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb)); 2149 skb_clear_tstamp(skb); 2150 2151 dev_xmit_recursion_inc(); 2152 ret = dev_queue_xmit(skb); 2153 dev_xmit_recursion_dec(); 2154 2155 return ret; 2156 } 2157 2158 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, 2159 u32 flags) 2160 { 2161 unsigned int mlen = skb_network_offset(skb); 2162 2163 if (unlikely(skb->len <= mlen)) { 2164 kfree_skb(skb); 2165 return -ERANGE; 2166 } 2167 2168 if (mlen) { 2169 __skb_pull(skb, mlen); 2170 2171 /* At ingress, the mac header has 
already been pulled once. 2172 * At egress, skb_pospull_rcsum has to be done in case that 2173 * the skb is originated from ingress (i.e. a forwarded skb) 2174 * to ensure that rcsum starts at net header. 2175 */ 2176 if (!skb_at_tc_ingress(skb)) 2177 skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); 2178 } 2179 skb_pop_mac_header(skb); 2180 skb_reset_mac_len(skb); 2181 return flags & BPF_F_INGRESS ? 2182 __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); 2183 } 2184 2185 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, 2186 u32 flags) 2187 { 2188 /* Verify that a link layer header is carried */ 2189 if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { 2190 kfree_skb(skb); 2191 return -ERANGE; 2192 } 2193 2194 bpf_push_mac_rcsum(skb); 2195 return flags & BPF_F_INGRESS ? 2196 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 2197 } 2198 2199 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, 2200 u32 flags) 2201 { 2202 if (dev_is_mac_header_xmit(dev)) 2203 return __bpf_redirect_common(skb, dev, flags); 2204 else 2205 return __bpf_redirect_no_mac(skb, dev, flags); 2206 } 2207 2208 #if IS_ENABLED(CONFIG_IPV6) 2209 static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, 2210 struct net_device *dev, struct bpf_nh_params *nh) 2211 { 2212 u32 hh_len = LL_RESERVED_SPACE(dev); 2213 const struct in6_addr *nexthop; 2214 struct dst_entry *dst = NULL; 2215 struct neighbour *neigh; 2216 2217 if (dev_xmit_recursion()) { 2218 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 2219 goto out_drop; 2220 } 2221 2222 skb->dev = dev; 2223 skb_clear_tstamp(skb); 2224 2225 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 2226 skb = skb_expand_head(skb, hh_len); 2227 if (!skb) 2228 return -ENOMEM; 2229 } 2230 2231 if (unlikely(!ipv6_mod_enabled())) 2232 goto out_drop; 2233 2234 rcu_read_lock(); 2235 if (!nh) { 2236 dst = skb_dst(skb); 2237 nexthop = 
rt6_nexthop(dst_rt6_info(dst), 2238 &ipv6_hdr(skb)->daddr); 2239 } else { 2240 nexthop = &nh->ipv6_nh; 2241 } 2242 neigh = ip_neigh_gw6(dev, nexthop); 2243 if (likely(!IS_ERR(neigh))) { 2244 int ret; 2245 2246 sock_confirm_neigh(skb, neigh); 2247 local_bh_disable(); 2248 dev_xmit_recursion_inc(); 2249 ret = neigh_output(neigh, skb, false); 2250 dev_xmit_recursion_dec(); 2251 local_bh_enable(); 2252 rcu_read_unlock(); 2253 return ret; 2254 } 2255 rcu_read_unlock(); 2256 if (dst) 2257 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 2258 out_drop: 2259 kfree_skb(skb); 2260 return -ENETDOWN; 2261 } 2262 2263 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, 2264 struct bpf_nh_params *nh) 2265 { 2266 const struct ipv6hdr *ip6h = ipv6_hdr(skb); 2267 struct net *net = dev_net(dev); 2268 int err, ret = NET_XMIT_DROP; 2269 2270 if (!nh) { 2271 struct dst_entry *dst; 2272 struct flowi6 fl6 = { 2273 .flowi6_flags = FLOWI_FLAG_ANYSRC, 2274 .flowi6_mark = skb->mark, 2275 .flowlabel = ip6_flowinfo(ip6h), 2276 .flowi6_oif = dev->ifindex, 2277 .flowi6_proto = ip6h->nexthdr, 2278 .daddr = ip6h->daddr, 2279 .saddr = ip6h->saddr, 2280 }; 2281 2282 dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); 2283 if (IS_ERR(dst)) 2284 goto out_drop; 2285 2286 skb_dst_drop(skb); 2287 skb_dst_set(skb, dst); 2288 } else if (nh->nh_family != AF_INET6) { 2289 goto out_drop; 2290 } 2291 2292 err = bpf_out_neigh_v6(net, skb, dev, nh); 2293 if (unlikely(net_xmit_eval(err))) 2294 dev_core_stats_tx_dropped_inc(dev); 2295 else 2296 ret = NET_XMIT_SUCCESS; 2297 goto out_xmit; 2298 out_drop: 2299 dev_core_stats_tx_dropped_inc(dev); 2300 kfree_skb(skb); 2301 out_xmit: 2302 return ret; 2303 } 2304 #else 2305 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, 2306 struct bpf_nh_params *nh) 2307 { 2308 kfree_skb(skb); 2309 return NET_XMIT_DROP; 2310 } 2311 #endif /* CONFIG_IPV6 */ 2312 2313 #if IS_ENABLED(CONFIG_INET) 2314 
/* Resolve the neighbour for an IPv4 @skb on @dev and transmit through it.
 *
 * The neighbour comes from the skb's route when @nh is NULL, otherwise
 * from the IPv4 or IPv6 gateway in @nh; any other nh_family is dropped.
 * Returns the neigh_output() result on success. On the drop paths the skb
 * is freed and -ENETDOWN returned. When skb_expand_head() fails, -ENOMEM
 * is returned and the skb is not freed here (presumably the helper
 * already consumed it - TODO confirm).
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
			    struct net_device *dev, struct bpf_nh_params *nh)
{
	u32 hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	if (dev_xmit_recursion()) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		goto out_drop;
	}

	skb->dev = dev;
	skb_clear_tstamp(skb);

	/* Make room for the link-layer header the device will prepend. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb)
			return -ENOMEM;
	}

	rcu_read_lock();
	if (!nh) {
		struct rtable *rt = skb_rtable(skb);

		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	} else if (nh->nh_family == AF_INET6) {
		/* IPv6 gateway for an IPv4 packet needs the IPv6 module. */
		if (unlikely(!ipv6_mod_enabled())) {
			rcu_read_unlock();
			goto out_drop;
		}
		neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
		is_v6gw = true;
	} else if (nh->nh_family == AF_INET) {
		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
	} else {
		rcu_read_unlock();
		goto out_drop;
	}

	if (likely(!IS_ERR(neigh))) {
		int ret;

		sock_confirm_neigh(skb, neigh);
		/* Transmit with BHs off and the recursion counter raised. */
		local_bh_disable();
		dev_xmit_recursion_inc();
		ret = neigh_output(neigh, skb, is_v6gw);
		dev_xmit_recursion_dec();
		local_bh_enable();
		rcu_read_unlock();
		return ret;
	}
	rcu_read_unlock();
out_drop:
	kfree_skb(skb);
	return -ENETDOWN;
}

/* IPv4 side of the neighbour redirect.
 *
 * Without @nh, an output route is looked up from the packet's IPv4 header
 * fields (bound to @dev via flowi4_oif); only RTN_UNICAST and RTN_LOCAL
 * routes are accepted, and the result replaces the skb's dst. Returns
 * NET_XMIT_SUCCESS when bpf_out_neigh_v4() succeeds, NET_XMIT_DROP
 * otherwise; every failure also bumps the device's tx_dropped counter.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct net *net = dev_net(dev);
	int err, ret = NET_XMIT_DROP;

	if (!nh) {
		struct flowi4 fl4 = {
			.flowi4_flags	= FLOWI_FLAG_ANYSRC,
			.flowi4_mark	= skb->mark,
			.flowi4_dscp	= ip4h_dscp(ip4h),
			.flowi4_oif	= dev->ifindex,
			.flowi4_proto	= ip4h->protocol,
			.daddr		= ip4h->daddr,
			.saddr		= ip4h->saddr,
		};
		struct rtable *rt;

		rt = ip_route_output_flow(net, &fl4, NULL);
		if (IS_ERR(rt))
			goto out_drop;
		if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
			/* Release the route we are not going to use. */
			ip_rt_put(rt);
			goto out_drop;
		}

		skb_dst_drop(skb);
		skb_dst_set(skb, &rt->dst);
	}

	/* bpf_out_neigh_v4() disposes of the skb on its own error paths,
	 * so it must not be freed again here.
	 */
	err = bpf_out_neigh_v4(net, skb, dev, nh);
	if (unlikely(net_xmit_eval(err)))
		dev_core_stats_tx_dropped_inc(dev);
	else
		ret = NET_XMIT_SUCCESS;
	goto out_xmit;
out_drop:
	dev_core_stats_tx_dropped_inc(dev);
	kfree_skb(skb);
out_xmit:
	return ret;
}
#else
/* Without CONFIG_INET there is nothing to resolve: free and drop. */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
				   struct bpf_nh_params *nh)
{
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
#endif /* CONFIG_INET */

/* Entry point for neighbour-based redirects.
 *
 * Requires a mac header in front of the network header and a
 * non-multicast Ethernet destination. The Ethernet header is then pulled
 * off, the network header reset to the new data start, and the skb
 * dispatched by skb->protocol to the IPv4 or IPv6 variant. Anything else
 * is freed and answered with -ENOTSUPP.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
				struct bpf_nh_params *nh)
{
	struct ethhdr *ethh = eth_hdr(skb);

	if (unlikely(skb->mac_header >= skb->network_header))
		goto out;
	bpf_push_mac_rcsum(skb);
	if (is_multicast_ether_addr(ethh->h_dest))
		goto out;

	skb_pull(skb, sizeof(*ethh));
	skb_unset_mac_header(skb);
	skb_reset_network_header(skb);

	if (skb->protocol == htons(ETH_P_IP))
		return __bpf_redirect_neigh_v4(skb, dev, nh);
	else if (skb->protocol == htons(ETH_P_IPV6))
		return __bpf_redirect_neigh_v6(skb, dev, nh);
out:
	kfree_skb(skb);
	return -ENOTSUPP;
}

/* Internal, non-exposed redirect flags.
*/ 2449 enum { 2450 BPF_F_NEIGH = (1ULL << 16), 2451 BPF_F_PEER = (1ULL << 17), 2452 BPF_F_NEXTHOP = (1ULL << 18), 2453 #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) 2454 }; 2455 2456 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) 2457 { 2458 struct net_device *dev; 2459 struct sk_buff *clone; 2460 int ret; 2461 2462 BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); 2463 2464 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) 2465 return -EINVAL; 2466 2467 /* BPF test infra's convert___skb_to_skb() can create type-less 2468 * GSO packets. gso_features_check() will detect this as a bad 2469 * offload. However, lets not leak them out in the first place. 2470 */ 2471 if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) 2472 return -EBADMSG; 2473 2474 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); 2475 if (unlikely(!dev)) 2476 return -EINVAL; 2477 2478 clone = skb_clone(skb, GFP_ATOMIC); 2479 if (unlikely(!clone)) 2480 return -ENOMEM; 2481 2482 /* For direct write, we need to keep the invariant that the skbs 2483 * we're dealing with need to be uncloned. Should uncloning fail 2484 * here, we need to free the just generated clone to unclone once 2485 * again. 
2486 */ 2487 ret = bpf_try_make_head_writable(skb); 2488 if (unlikely(ret)) { 2489 kfree_skb(clone); 2490 return -ENOMEM; 2491 } 2492 2493 return __bpf_redirect(clone, dev, flags); 2494 } 2495 2496 static const struct bpf_func_proto bpf_clone_redirect_proto = { 2497 .func = bpf_clone_redirect, 2498 .gpl_only = false, 2499 .ret_type = RET_INTEGER, 2500 .arg1_type = ARG_PTR_TO_CTX, 2501 .arg2_type = ARG_ANYTHING, 2502 .arg3_type = ARG_ANYTHING, 2503 }; 2504 2505 static struct net_device *skb_get_peer_dev(struct net_device *dev) 2506 { 2507 const struct net_device_ops *ops = dev->netdev_ops; 2508 2509 if (likely(ops->ndo_get_peer_dev)) 2510 return INDIRECT_CALL_1(ops->ndo_get_peer_dev, 2511 netkit_peer_dev, dev); 2512 return NULL; 2513 } 2514 2515 int skb_do_redirect(struct sk_buff *skb) 2516 { 2517 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 2518 struct net *net = dev_net(skb->dev); 2519 struct net_device *dev; 2520 u32 flags = ri->flags; 2521 2522 dev = dev_get_by_index_rcu(net, ri->tgt_index); 2523 ri->tgt_index = 0; 2524 ri->flags = 0; 2525 if (unlikely(!dev)) 2526 goto out_drop; 2527 if (flags & BPF_F_PEER) { 2528 if (unlikely(!skb_at_tc_ingress(skb))) 2529 goto out_drop; 2530 dev = skb_get_peer_dev(dev); 2531 if (unlikely(!dev || 2532 !(dev->flags & IFF_UP) || 2533 net_eq(net, dev_net(dev)))) 2534 goto out_drop; 2535 skb->dev = dev; 2536 dev_sw_netstats_rx_add(dev, skb->len); 2537 skb_scrub_packet(skb, false); 2538 return -EAGAIN; 2539 } 2540 return flags & BPF_F_NEIGH ? 2541 __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? 
2542 &ri->nh : NULL) : 2543 __bpf_redirect(skb, dev, flags); 2544 out_drop: 2545 kfree_skb(skb); 2546 return -EINVAL; 2547 } 2548 2549 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 2550 { 2551 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 2552 2553 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) 2554 return TC_ACT_SHOT; 2555 2556 ri->flags = flags; 2557 ri->tgt_index = ifindex; 2558 2559 return TC_ACT_REDIRECT; 2560 } 2561 2562 static const struct bpf_func_proto bpf_redirect_proto = { 2563 .func = bpf_redirect, 2564 .gpl_only = false, 2565 .ret_type = RET_INTEGER, 2566 .arg1_type = ARG_ANYTHING, 2567 .arg2_type = ARG_ANYTHING, 2568 }; 2569 2570 BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) 2571 { 2572 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 2573 2574 if (unlikely(flags)) 2575 return TC_ACT_SHOT; 2576 2577 ri->flags = BPF_F_PEER; 2578 ri->tgt_index = ifindex; 2579 2580 return TC_ACT_REDIRECT; 2581 } 2582 2583 static const struct bpf_func_proto bpf_redirect_peer_proto = { 2584 .func = bpf_redirect_peer, 2585 .gpl_only = false, 2586 .ret_type = RET_INTEGER, 2587 .arg1_type = ARG_ANYTHING, 2588 .arg2_type = ARG_ANYTHING, 2589 }; 2590 2591 BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, 2592 int, plen, u64, flags) 2593 { 2594 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 2595 2596 if (unlikely((plen && plen < sizeof(*params)) || flags)) 2597 return TC_ACT_SHOT; 2598 2599 ri->flags = BPF_F_NEIGH | (plen ? 
BPF_F_NEXTHOP : 0); 2600 ri->tgt_index = ifindex; 2601 2602 BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); 2603 if (plen) 2604 memcpy(&ri->nh, params, sizeof(ri->nh)); 2605 2606 return TC_ACT_REDIRECT; 2607 } 2608 2609 static const struct bpf_func_proto bpf_redirect_neigh_proto = { 2610 .func = bpf_redirect_neigh, 2611 .gpl_only = false, 2612 .ret_type = RET_INTEGER, 2613 .arg1_type = ARG_ANYTHING, 2614 .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, 2615 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 2616 .arg4_type = ARG_ANYTHING, 2617 }; 2618 2619 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) 2620 { 2621 msg->apply_bytes = bytes; 2622 return 0; 2623 } 2624 2625 static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { 2626 .func = bpf_msg_apply_bytes, 2627 .gpl_only = false, 2628 .ret_type = RET_INTEGER, 2629 .arg1_type = ARG_PTR_TO_CTX, 2630 .arg2_type = ARG_ANYTHING, 2631 }; 2632 2633 BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) 2634 { 2635 msg->cork_bytes = bytes; 2636 return 0; 2637 } 2638 2639 static void sk_msg_reset_curr(struct sk_msg *msg) 2640 { 2641 if (!msg->sg.size) { 2642 msg->sg.curr = msg->sg.start; 2643 msg->sg.copybreak = 0; 2644 } else { 2645 u32 i = msg->sg.end; 2646 2647 sk_msg_iter_var_prev(i); 2648 msg->sg.curr = i; 2649 msg->sg.copybreak = msg->sg.data[i].length; 2650 } 2651 } 2652 2653 static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { 2654 .func = bpf_msg_cork_bytes, 2655 .gpl_only = false, 2656 .ret_type = RET_INTEGER, 2657 .arg1_type = ARG_PTR_TO_CTX, 2658 .arg2_type = ARG_ANYTHING, 2659 }; 2660 2661 BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, 2662 u32, end, u64, flags) 2663 { 2664 u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start; 2665 u32 first_sge, last_sge, i, shift, bytes_sg_total; 2666 struct scatterlist *sge; 2667 u8 *raw, *to, *from; 2668 struct page *page; 2669 2670 if (unlikely(flags || end <= 
start)) 2671 return -EINVAL; 2672 2673 /* First find the starting scatterlist element */ 2674 i = msg->sg.start; 2675 do { 2676 offset += len; 2677 len = sk_msg_elem(msg, i)->length; 2678 if (start < offset + len) 2679 break; 2680 sk_msg_iter_var_next(i); 2681 } while (i != msg->sg.end); 2682 2683 if (unlikely(start >= offset + len)) 2684 return -EINVAL; 2685 2686 first_sge = i; 2687 /* The start may point into the sg element so we need to also 2688 * account for the headroom. 2689 */ 2690 bytes_sg_total = start - offset + bytes; 2691 if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) 2692 goto out; 2693 2694 /* At this point we need to linearize multiple scatterlist 2695 * elements or a single shared page. Either way we need to 2696 * copy into a linear buffer exclusively owned by BPF. Then 2697 * place the buffer in the scatterlist and fixup the original 2698 * entries by removing the entries now in the linear buffer 2699 * and shifting the remaining entries. For now we do not try 2700 * to copy partial entries to avoid complexity of running out 2701 * of sg_entry slots. The downside is reading a single byte 2702 * will copy the entire sg entry. 
2703 */ 2704 do { 2705 copy += sk_msg_elem(msg, i)->length; 2706 sk_msg_iter_var_next(i); 2707 if (bytes_sg_total <= copy) 2708 break; 2709 } while (i != msg->sg.end); 2710 last_sge = i; 2711 2712 if (unlikely(bytes_sg_total > copy)) 2713 return -EINVAL; 2714 2715 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, 2716 get_order(copy)); 2717 if (unlikely(!page)) 2718 return -ENOMEM; 2719 2720 raw = page_address(page); 2721 i = first_sge; 2722 do { 2723 sge = sk_msg_elem(msg, i); 2724 from = sg_virt(sge); 2725 len = sge->length; 2726 to = raw + poffset; 2727 2728 memcpy(to, from, len); 2729 poffset += len; 2730 sge->length = 0; 2731 put_page(sg_page(sge)); 2732 2733 sk_msg_iter_var_next(i); 2734 } while (i != last_sge); 2735 2736 sg_set_page(&msg->sg.data[first_sge], page, copy, 0); 2737 2738 /* To repair sg ring we need to shift entries. If we only 2739 * had a single entry though we can just replace it and 2740 * be done. Otherwise walk the ring and shift the entries. 2741 */ 2742 WARN_ON_ONCE(last_sge == first_sge); 2743 shift = last_sge > first_sge ? 2744 last_sge - first_sge - 1 : 2745 NR_MSG_FRAG_IDS - first_sge + last_sge - 1; 2746 if (!shift) 2747 goto out; 2748 2749 i = first_sge; 2750 sk_msg_iter_var_next(i); 2751 do { 2752 u32 move_from; 2753 2754 if (i + shift >= NR_MSG_FRAG_IDS) 2755 move_from = i + shift - NR_MSG_FRAG_IDS; 2756 else 2757 move_from = i + shift; 2758 if (move_from == msg->sg.end) 2759 break; 2760 2761 msg->sg.data[i] = msg->sg.data[move_from]; 2762 msg->sg.data[move_from].length = 0; 2763 msg->sg.data[move_from].page_link = 0; 2764 msg->sg.data[move_from].offset = 0; 2765 sk_msg_iter_var_next(i); 2766 } while (1); 2767 2768 msg->sg.end = msg->sg.end - shift > msg->sg.end ? 
2769 msg->sg.end - shift + NR_MSG_FRAG_IDS : 2770 msg->sg.end - shift; 2771 out: 2772 sk_msg_reset_curr(msg); 2773 msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset; 2774 msg->data_end = msg->data + bytes; 2775 return 0; 2776 } 2777 2778 static const struct bpf_func_proto bpf_msg_pull_data_proto = { 2779 .func = bpf_msg_pull_data, 2780 .gpl_only = false, 2781 .ret_type = RET_INTEGER, 2782 .arg1_type = ARG_PTR_TO_CTX, 2783 .arg2_type = ARG_ANYTHING, 2784 .arg3_type = ARG_ANYTHING, 2785 .arg4_type = ARG_ANYTHING, 2786 }; 2787 2788 BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, 2789 u32, len, u64, flags) 2790 { 2791 struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; 2792 u32 new, i = 0, l = 0, space, copy = 0, offset = 0; 2793 u8 *raw, *to, *from; 2794 struct page *page; 2795 2796 if (unlikely(flags)) 2797 return -EINVAL; 2798 2799 if (unlikely(len == 0)) 2800 return 0; 2801 2802 /* First find the starting scatterlist element */ 2803 i = msg->sg.start; 2804 do { 2805 offset += l; 2806 l = sk_msg_elem(msg, i)->length; 2807 2808 if (start < offset + l) 2809 break; 2810 sk_msg_iter_var_next(i); 2811 } while (i != msg->sg.end); 2812 2813 if (start > offset + l) 2814 return -EINVAL; 2815 2816 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); 2817 2818 /* If no space available will fallback to copy, we need at 2819 * least one scatterlist elem available to push data into 2820 * when start aligns to the beginning of an element or two 2821 * when it falls inside an element. We handle the start equals 2822 * offset case because its the common case for inserting a 2823 * header. 
2824 */ 2825 if (!space || (space == 1 && start != offset)) 2826 copy = msg->sg.data[i].length; 2827 2828 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, 2829 get_order(copy + len)); 2830 if (unlikely(!page)) 2831 return -ENOMEM; 2832 2833 if (copy) { 2834 int front, back; 2835 2836 raw = page_address(page); 2837 2838 if (i == msg->sg.end) 2839 sk_msg_iter_var_prev(i); 2840 psge = sk_msg_elem(msg, i); 2841 front = start - offset; 2842 back = psge->length - front; 2843 from = sg_virt(psge); 2844 2845 if (front) 2846 memcpy(raw, from, front); 2847 2848 if (back) { 2849 from += front; 2850 to = raw + front + len; 2851 2852 memcpy(to, from, back); 2853 } 2854 2855 put_page(sg_page(psge)); 2856 new = i; 2857 goto place_new; 2858 } 2859 2860 if (start - offset) { 2861 if (i == msg->sg.end) 2862 sk_msg_iter_var_prev(i); 2863 psge = sk_msg_elem(msg, i); 2864 rsge = sk_msg_elem_cpy(msg, i); 2865 2866 psge->length = start - offset; 2867 rsge.length -= psge->length; 2868 rsge.offset += start; 2869 2870 sk_msg_iter_var_next(i); 2871 sg_unmark_end(psge); 2872 sg_unmark_end(&rsge); 2873 } 2874 2875 /* Slot(s) to place newly allocated data */ 2876 sk_msg_iter_next(msg, end); 2877 new = i; 2878 sk_msg_iter_var_next(i); 2879 2880 if (i == msg->sg.end) { 2881 if (!rsge.length) 2882 goto place_new; 2883 sk_msg_iter_next(msg, end); 2884 goto place_new; 2885 } 2886 2887 /* Shift one or two slots as needed */ 2888 sge = sk_msg_elem_cpy(msg, new); 2889 sg_unmark_end(&sge); 2890 2891 nsge = sk_msg_elem_cpy(msg, i); 2892 if (rsge.length) { 2893 sk_msg_iter_var_next(i); 2894 nnsge = sk_msg_elem_cpy(msg, i); 2895 sk_msg_iter_next(msg, end); 2896 } 2897 2898 while (i != msg->sg.end) { 2899 msg->sg.data[i] = sge; 2900 sge = nsge; 2901 sk_msg_iter_var_next(i); 2902 if (rsge.length) { 2903 nsge = nnsge; 2904 nnsge = sk_msg_elem_cpy(msg, i); 2905 } else { 2906 nsge = sk_msg_elem_cpy(msg, i); 2907 } 2908 } 2909 2910 place_new: 2911 /* Place newly allocated data buffer */ 2912 
sk_mem_charge(msg->sk, len); 2913 msg->sg.size += len; 2914 __clear_bit(new, msg->sg.copy); 2915 sg_set_page(&msg->sg.data[new], page, len + copy, 0); 2916 if (rsge.length) { 2917 get_page(sg_page(&rsge)); 2918 sk_msg_iter_var_next(new); 2919 msg->sg.data[new] = rsge; 2920 } 2921 2922 sk_msg_reset_curr(msg); 2923 sk_msg_compute_data_pointers(msg); 2924 return 0; 2925 } 2926 2927 static const struct bpf_func_proto bpf_msg_push_data_proto = { 2928 .func = bpf_msg_push_data, 2929 .gpl_only = false, 2930 .ret_type = RET_INTEGER, 2931 .arg1_type = ARG_PTR_TO_CTX, 2932 .arg2_type = ARG_ANYTHING, 2933 .arg3_type = ARG_ANYTHING, 2934 .arg4_type = ARG_ANYTHING, 2935 }; 2936 2937 static void sk_msg_shift_left(struct sk_msg *msg, int i) 2938 { 2939 struct scatterlist *sge = sk_msg_elem(msg, i); 2940 int prev; 2941 2942 put_page(sg_page(sge)); 2943 do { 2944 prev = i; 2945 sk_msg_iter_var_next(i); 2946 msg->sg.data[prev] = msg->sg.data[i]; 2947 } while (i != msg->sg.end); 2948 2949 sk_msg_iter_prev(msg, end); 2950 } 2951 2952 static void sk_msg_shift_right(struct sk_msg *msg, int i) 2953 { 2954 struct scatterlist tmp, sge; 2955 2956 sk_msg_iter_next(msg, end); 2957 sge = sk_msg_elem_cpy(msg, i); 2958 sk_msg_iter_var_next(i); 2959 tmp = sk_msg_elem_cpy(msg, i); 2960 2961 while (i != msg->sg.end) { 2962 msg->sg.data[i] = sge; 2963 sk_msg_iter_var_next(i); 2964 sge = tmp; 2965 tmp = sk_msg_elem_cpy(msg, i); 2966 } 2967 } 2968 2969 BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, 2970 u32, len, u64, flags) 2971 { 2972 u32 i = 0, l = 0, space, offset = 0; 2973 u64 last = start + len; 2974 int pop; 2975 2976 if (unlikely(flags)) 2977 return -EINVAL; 2978 2979 if (unlikely(len == 0)) 2980 return 0; 2981 2982 /* First find the starting scatterlist element */ 2983 i = msg->sg.start; 2984 do { 2985 offset += l; 2986 l = sk_msg_elem(msg, i)->length; 2987 2988 if (start < offset + l) 2989 break; 2990 sk_msg_iter_var_next(i); 2991 } while (i != msg->sg.end); 2992 2993 /* 
Bounds checks: start and pop must be inside message */ 2994 if (start >= offset + l || last > msg->sg.size) 2995 return -EINVAL; 2996 2997 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); 2998 2999 pop = len; 3000 /* --------------| offset 3001 * -| start |-------- len -------| 3002 * 3003 * |----- a ----|-------- pop -------|----- b ----| 3004 * |______________________________________________| length 3005 * 3006 * 3007 * a: region at front of scatter element to save 3008 * b: region at back of scatter element to save when length > A + pop 3009 * pop: region to pop from element, same as input 'pop' here will be 3010 * decremented below per iteration. 3011 * 3012 * Two top-level cases to handle when start != offset, first B is non 3013 * zero and second B is zero corresponding to when a pop includes more 3014 * than one element. 3015 * 3016 * Then if B is non-zero AND there is no space allocate space and 3017 * compact A, B regions into page. If there is space shift ring to 3018 * the right free'ing the next element in ring to place B, leaving 3019 * A untouched except to reduce length. 
3020 */ 3021 if (start != offset) { 3022 struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); 3023 int a = start - offset; 3024 int b = sge->length - pop - a; 3025 3026 sk_msg_iter_var_next(i); 3027 3028 if (b > 0) { 3029 if (space) { 3030 sge->length = a; 3031 sk_msg_shift_right(msg, i); 3032 nsge = sk_msg_elem(msg, i); 3033 get_page(sg_page(sge)); 3034 sg_set_page(nsge, 3035 sg_page(sge), 3036 b, sge->offset + pop + a); 3037 } else { 3038 struct page *page, *orig; 3039 u8 *to, *from; 3040 3041 page = alloc_pages(__GFP_NOWARN | 3042 __GFP_COMP | GFP_ATOMIC, 3043 get_order(a + b)); 3044 if (unlikely(!page)) 3045 return -ENOMEM; 3046 3047 orig = sg_page(sge); 3048 from = sg_virt(sge); 3049 to = page_address(page); 3050 memcpy(to, from, a); 3051 memcpy(to + a, from + a + pop, b); 3052 sg_set_page(sge, page, a + b, 0); 3053 put_page(orig); 3054 } 3055 pop = 0; 3056 } else { 3057 pop -= (sge->length - a); 3058 sge->length = a; 3059 } 3060 } 3061 3062 /* From above the current layout _must_ be as follows, 3063 * 3064 * -| offset 3065 * -| start 3066 * 3067 * |---- pop ---|---------------- b ------------| 3068 * |____________________________________________| length 3069 * 3070 * Offset and start of the current msg elem are equal because in the 3071 * previous case we handled offset != start and either consumed the 3072 * entire element and advanced to the next element OR pop == 0. 3073 * 3074 * Two cases to handle here are first pop is less than the length 3075 * leaving some remainder b above. Simply adjust the element's layout 3076 * in this case. Or pop >= length of the element so that b = 0. In this 3077 * case advance to next element decrementing pop. 
3078 */ 3079 while (pop) { 3080 struct scatterlist *sge = sk_msg_elem(msg, i); 3081 3082 if (pop < sge->length) { 3083 sge->length -= pop; 3084 sge->offset += pop; 3085 pop = 0; 3086 } else { 3087 pop -= sge->length; 3088 sk_msg_shift_left(msg, i); 3089 } 3090 } 3091 3092 sk_mem_uncharge(msg->sk, len - pop); 3093 msg->sg.size -= (len - pop); 3094 sk_msg_reset_curr(msg); 3095 sk_msg_compute_data_pointers(msg); 3096 return 0; 3097 } 3098 3099 static const struct bpf_func_proto bpf_msg_pop_data_proto = { 3100 .func = bpf_msg_pop_data, 3101 .gpl_only = false, 3102 .ret_type = RET_INTEGER, 3103 .arg1_type = ARG_PTR_TO_CTX, 3104 .arg2_type = ARG_ANYTHING, 3105 .arg3_type = ARG_ANYTHING, 3106 .arg4_type = ARG_ANYTHING, 3107 }; 3108 3109 #ifdef CONFIG_CGROUP_NET_CLASSID 3110 BPF_CALL_0(bpf_get_cgroup_classid_curr) 3111 { 3112 return __task_get_classid(current); 3113 } 3114 3115 const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { 3116 .func = bpf_get_cgroup_classid_curr, 3117 .gpl_only = false, 3118 .ret_type = RET_INTEGER, 3119 }; 3120 3121 BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) 3122 { 3123 struct sock *sk = skb_to_full_sk(skb); 3124 3125 if (!sk || !sk_fullsock(sk)) 3126 return 0; 3127 3128 return sock_cgroup_classid(&sk->sk_cgrp_data); 3129 } 3130 3131 static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { 3132 .func = bpf_skb_cgroup_classid, 3133 .gpl_only = false, 3134 .ret_type = RET_INTEGER, 3135 .arg1_type = ARG_PTR_TO_CTX, 3136 }; 3137 #endif 3138 3139 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 3140 { 3141 return task_get_classid(skb); 3142 } 3143 3144 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 3145 .func = bpf_get_cgroup_classid, 3146 .gpl_only = false, 3147 .ret_type = RET_INTEGER, 3148 .arg1_type = ARG_PTR_TO_CTX, 3149 }; 3150 3151 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) 3152 { 3153 return dst_tclassid(skb); 3154 } 3155 3156 static const struct 
bpf_func_proto bpf_get_route_realm_proto = {
	.func           = bpf_get_route_realm,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* Return skb_get_hash() for @skb. */
BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
	/* If skb_clear_hash() was called due to mangling, we can
	 * trigger SW recalculation here. Later access to hash
	 * can then use the inline skb->hash via context directly
	 * instead of calling this helper again.
	 */
	return skb_get_hash(skb);
}

static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
	.func		= bpf_get_hash_recalc,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Clear the hash stored in @skb. Always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
	/* After all direct packet write, this can be used once for
	 * triggering a lazy recalc on next skb_get_hash() invocation.
	 */
	skb_clear_hash(skb);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
	.func		= bpf_set_hash_invalid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

/* Install @hash as the software hash of @skb. Always returns 0. */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
	/* Set user specified hash as L4(+), so that it gets returned
	 * on skb_get_hash() call unless BPF prog later on triggers a
	 * skb_clear_hash().
	 */
	__skb_set_sw_hash(skb, hash, true);
	return 0;
}

static const struct bpf_func_proto bpf_set_hash_proto = {
	.func		= bpf_set_hash,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Push a VLAN tag (@vlan_proto, @vlan_tci) onto @skb.
 *
 * Any @vlan_proto other than 802.1Q or 802.1AD is replaced by 802.1Q
 * rather than rejected. The mac header is pushed around the operation
 * and pulled again afterwards, then mac_len and the BPF data pointers
 * are refreshed. Returns the skb_vlan_push() result.
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
	   u16, vlan_tci)
{
	int ret;

	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
		     vlan_proto != htons(ETH_P_8021AD)))
		vlan_proto = htons(ETH_P_8021Q);

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
	bpf_pull_mac_rcsum(skb);
	skb_reset_mac_len(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
	.func           = bpf_skb_vlan_push,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

/* Pop the VLAN tag from @skb with the mac header pushed around the
 * operation, then refresh the BPF data pointers. Returns the
 * skb_vlan_pop() result.
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
	int ret;

	bpf_push_mac_rcsum(skb);
	ret = skb_vlan_pop(skb);
	bpf_pull_mac_rcsum(skb);

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
	.func           = bpf_skb_vlan_pop,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* Set skb->protocol to @proto (host byte order in, network order stored)
 * and drop the attached dst when skb_valid_dst() reports one.
 */
static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
{
	skb->protocol = htons(proto);
	if (skb_valid_dst(skb))
		skb_dst_drop(skb);
}

/* Open a zeroed gap of @len bytes at offset @off from the new skb->data.
 * Headroom must already have been reserved by the caller. Always
 * returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
	/* Caller already did skb_cow() with meta_len+len as headroom,
	 * so no need to do it here.
	 */
	skb_push(skb, len);
	skb_postpush_data_move(skb, len, off);
	memset(skb->data + off, 0, len);

	/* No skb_postpush_rcsum(skb, skb->data + off, len)
	 * needed here as it does not change the skb->csum
	 * result for checksum complete when summing over
	 * zeroed blocks.
	 */
	return 0;
}

/* Remove @len bytes located at offset @off from skb->data. Returns 0,
 * or -ENOMEM when the first @off + @len bytes cannot be pulled into the
 * linear area.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
	void *old_data;

	/* skb_ensure_writable() is not needed here, as we're
	 * already working on an uncloned skb.
	 */
	if (unlikely(!pskb_may_pull(skb, off + len)))
		return -ENOMEM;

	old_data = skb->data;
	__skb_pull(skb, len);
	/* Take the removed bytes out of the checksum before moving data. */
	skb_postpull_rcsum(skb, old_data + off, len);
	skb_postpull_data_move(skb, len, off);

	return 0;
}

/* Grow the headers by @len bytes at @off and move the mac and network
 * header offsets back by @len. The transport header follows the network
 * header only if the two were equal on entry.
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* There's no need for __skb_push()/__skb_pull() pair to
	 * get to the start of the mac header as we're guaranteed
	 * to always start from here under eBPF.
	 */
	ret = bpf_skb_generic_push(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header -= len;
		skb->network_header -= len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}

/* Shrink the headers by @len bytes at @off and move the mac and network
 * header offsets forward by @len. The transport header follows the
 * network header only if the two were equal on entry.
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
	bool trans_same = skb->transport_header == skb->network_header;
	int ret;

	/* Same here, __skb_push()/__skb_pull() pair not needed. */
	ret = bpf_skb_generic_pop(skb, off, len);
	if (likely(!ret)) {
		skb->mac_header += len;
		skb->network_header += len;
		if (trans_same)
			skb->transport_header = skb->network_header;
	}

	return ret;
}

/* Prepare @skb for IPv4 -> IPv6 translation: make room for the larger
 * header right after the mac header, switch a TCPv4 GSO type to TCPv6,
 * mark GSO packets dodgy, set the protocol to ETH_P_IPV6 and clear the
 * hash. Returns 0 or a negative error from the headroom/push steps.
 */
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	const u8 meta_len = skb_metadata_len(skb);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_cow(skb, meta_len + len_diff);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
		if (shinfo->gso_type & SKB_GSO_TCPV4) {
			shinfo->gso_type &= ~SKB_GSO_TCPV4;
			shinfo->gso_type |=  SKB_GSO_TCPV6;
		}
		shinfo->gso_type |=  SKB_GSO_DODGY;
	}

	bpf_skb_change_protocol(skb, ETH_P_IPV6);
	skb_clear_hash(skb);

	return 0;
}

/* Prepare @skb for IPv6 -> IPv4 translation: unclone, remove the surplus
 * header bytes right after the mac header, switch a TCPv6 GSO type to
 * TCPv4, mark GSO packets dodgy, set the protocol to ETH_P_IP and clear
 * the hash. Returns 0 or a negative error from the unclone/pop steps.
 */
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
	u32 off = skb_mac_header_len(skb);
	int ret;

	ret = skb_unclone(skb, GFP_ATOMIC);
	if (unlikely(ret < 0))
		return ret;

	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
	if (unlikely(ret < 0))
		return ret;

	if (skb_is_gso(skb)) {
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		/* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
		if (shinfo->gso_type & SKB_GSO_TCPV6) {
			shinfo->gso_type &= ~SKB_GSO_TCPV6;
			shinfo->gso_type |=  SKB_GSO_TCPV4;
		}
		shinfo->gso_type |=  SKB_GSO_DODGY;
	}

	bpf_skb_change_protocol(skb, ETH_P_IP);
	skb_clear_hash(skb);

	return 0;
}

/* Dispatch on (current protocol, @to_proto). Only IPv4 -> IPv6 and
 * IPv6 -> IPv4 are handled; every other pair returns -ENOTSUPP.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
	__be16 from_proto = skb->protocol;

	if (from_proto == htons(ETH_P_IP) &&
	      to_proto == htons(ETH_P_IPV6))
		return bpf_skb_proto_4_to_6(skb);

	if (from_proto == htons(ETH_P_IPV6) &&
	      to_proto == htons(ETH_P_IP))
		return bpf_skb_proto_6_to_4(skb);

	return -ENOTSUPP;
}

/* Change the L3 protocol of @skb to @proto. @flags must be zero
 * (-EINVAL otherwise). The BPF data pointers are recomputed whether or
 * not the translation succeeded. Returns the bpf_skb_proto_xlat() result.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
	   u64, flags)
{
	int ret;

	if (unlikely(flags))
		return -EINVAL;

	/* General idea is that this helper does the basic groundwork
	 * needed for changing the protocol, and eBPF program fills the
	 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
	 * and other helpers, rather than passing a raw buffer here.
	 *
	 * The rationale is to keep this minimal and without a need to
	 * deal with raw packet data. F.e. even if we would pass buffers
	 * here, the program still needs to call the bpf_lX_csum_replace()
	 * helpers anyway. Plus, this way we keep also separation of
	 * concerns, since f.e. bpf_skb_store_bytes() should only take
	 * care of stores.
	 *
	 * Currently, additional options and extension header space are
	 * not supported, but flags register is reserved so we can adapt
	 * that. For offloads, we mark packet as dodgy, so that headers
	 * need to be verified first.
	 */
	ret = bpf_skb_proto_xlat(skb, proto);
	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_proto_proto = {
	.func		= bpf_skb_change_proto,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

/* Set skb->pkt_type to @pkt_type. Both the current and the requested
 * type must pass skb_pkt_type_ok(), otherwise -EINVAL is returned.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
	/* We only allow a restricted subset to be changed for now. */
	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
		     !skb_pkt_type_ok(pkt_type)))
		return -EINVAL;

	skb->pkt_type = pkt_type;
	return 0;
}

static const struct bpf_func_proto bpf_skb_change_type_proto = {
	.func		= bpf_skb_change_type,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

/* Size of the fixed L3 header for skb->protocol: struct iphdr for IPv4,
 * struct ipv6hdr for IPv6, and ~0U for any other protocol.
 */
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		return sizeof(struct iphdr);
	case htons(ETH_P_IPV6):
		return sizeof(struct ipv6hdr);
	default:
		return ~0U;
	}
}

/* Flag groups built from the BPF_F_ADJ_ROOM_* bits. */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK	(BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

#define BPF_F_ADJ_ROOM_DECAP_L3_MASK	(BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
					 BPF_F_ADJ_ROOM_DECAP_L3_IPV6)

#define BPF_F_ADJ_ROOM_MASK		(BPF_F_ADJ_ROOM_FIXED_GSO | \
					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
					 BPF_F_ADJ_ROOM_ENCAP_L2( \
					  BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
					 BPF_F_ADJ_ROOM_DECAP_L3_MASK)

static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
			    u64 flags)
{
	u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
	bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
u16 mac_len = 0, inner_net = 0, inner_trans = 0; 3508 const u8 meta_len = skb_metadata_len(skb); 3509 unsigned int gso_type = SKB_GSO_DODGY; 3510 int ret; 3511 3512 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { 3513 /* udp gso_size delineates datagrams, only allow if fixed */ 3514 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || 3515 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) 3516 return -ENOTSUPP; 3517 } 3518 3519 ret = skb_cow_head(skb, meta_len + len_diff); 3520 if (unlikely(ret < 0)) 3521 return ret; 3522 3523 if (encap) { 3524 if (skb->protocol != htons(ETH_P_IP) && 3525 skb->protocol != htons(ETH_P_IPV6)) 3526 return -ENOTSUPP; 3527 3528 if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 && 3529 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) 3530 return -EINVAL; 3531 3532 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE && 3533 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) 3534 return -EINVAL; 3535 3536 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && 3537 inner_mac_len < ETH_HLEN) 3538 return -EINVAL; 3539 3540 if (skb->encapsulation) 3541 return -EALREADY; 3542 3543 mac_len = skb->network_header - skb->mac_header; 3544 inner_net = skb->network_header; 3545 if (inner_mac_len > len_diff) 3546 return -EINVAL; 3547 inner_trans = skb->transport_header; 3548 } 3549 3550 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 3551 if (unlikely(ret < 0)) 3552 return ret; 3553 3554 if (encap) { 3555 skb->inner_mac_header = inner_net - inner_mac_len; 3556 skb->inner_network_header = inner_net; 3557 skb->inner_transport_header = inner_trans; 3558 3559 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) 3560 skb_set_inner_protocol(skb, htons(ETH_P_TEB)); 3561 else 3562 skb_set_inner_protocol(skb, skb->protocol); 3563 3564 skb->encapsulation = 1; 3565 skb_set_network_header(skb, mac_len); 3566 3567 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) 3568 gso_type |= SKB_GSO_UDP_TUNNEL; 3569 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE) 3570 gso_type |= SKB_GSO_GRE; 3571 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) 3572 gso_type |= 
SKB_GSO_IPXIP6; 3573 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) 3574 gso_type |= SKB_GSO_IPXIP4; 3575 3576 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE || 3577 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) { 3578 int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ? 3579 sizeof(struct ipv6hdr) : 3580 sizeof(struct iphdr); 3581 3582 skb_set_transport_header(skb, mac_len + nh_len); 3583 } 3584 3585 /* Match skb->protocol to new outer l3 protocol */ 3586 if (skb->protocol == htons(ETH_P_IP) && 3587 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) 3588 bpf_skb_change_protocol(skb, ETH_P_IPV6); 3589 else if (skb->protocol == htons(ETH_P_IPV6) && 3590 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4) 3591 bpf_skb_change_protocol(skb, ETH_P_IP); 3592 } 3593 3594 if (skb_is_gso(skb)) { 3595 struct skb_shared_info *shinfo = skb_shinfo(skb); 3596 3597 /* Header must be checked, and gso_segs recomputed. */ 3598 shinfo->gso_type |= gso_type; 3599 shinfo->gso_segs = 0; 3600 3601 /* Due to header growth, MSS needs to be downgraded. 3602 * There is a BUG_ON() when segmenting the frag_list with 3603 * head_frag true, so linearize the skb after downgrading 3604 * the MSS. 
3605 */ 3606 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) { 3607 skb_decrease_gso_size(shinfo, len_diff); 3608 if (shinfo->frag_list) 3609 return skb_linearize(skb); 3610 } 3611 } 3612 3613 return 0; 3614 } 3615 3616 static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, 3617 u64 flags) 3618 { 3619 int ret; 3620 3621 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | 3622 BPF_F_ADJ_ROOM_DECAP_L3_MASK | 3623 BPF_F_ADJ_ROOM_NO_CSUM_RESET))) 3624 return -EINVAL; 3625 3626 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { 3627 /* udp gso_size delineates datagrams, only allow if fixed */ 3628 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || 3629 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) 3630 return -ENOTSUPP; 3631 } 3632 3633 ret = skb_unclone(skb, GFP_ATOMIC); 3634 if (unlikely(ret < 0)) 3635 return ret; 3636 3637 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 3638 if (unlikely(ret < 0)) 3639 return ret; 3640 3641 /* Match skb->protocol to new outer l3 protocol */ 3642 if (skb->protocol == htons(ETH_P_IP) && 3643 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) 3644 bpf_skb_change_protocol(skb, ETH_P_IPV6); 3645 else if (skb->protocol == htons(ETH_P_IPV6) && 3646 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) 3647 bpf_skb_change_protocol(skb, ETH_P_IP); 3648 3649 if (skb_is_gso(skb)) { 3650 struct skb_shared_info *shinfo = skb_shinfo(skb); 3651 3652 /* Due to header shrink, MSS can be upgraded. */ 3653 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) 3654 skb_increase_gso_size(shinfo, len_diff); 3655 3656 /* Header must be checked, and gso_segs recomputed. 
*/ 3657 shinfo->gso_type |= SKB_GSO_DODGY; 3658 shinfo->gso_segs = 0; 3659 } 3660 3661 return 0; 3662 } 3663 3664 #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC 3665 3666 BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 3667 u32, mode, u64, flags) 3668 { 3669 u32 len_diff_abs = abs(len_diff); 3670 bool shrink = len_diff < 0; 3671 int ret = 0; 3672 3673 if (unlikely(flags || mode)) 3674 return -EINVAL; 3675 if (unlikely(len_diff_abs > 0xfffU)) 3676 return -EFAULT; 3677 3678 if (!shrink) { 3679 ret = skb_cow(skb, len_diff); 3680 if (unlikely(ret < 0)) 3681 return ret; 3682 __skb_push(skb, len_diff_abs); 3683 memset(skb->data, 0, len_diff_abs); 3684 } else { 3685 if (unlikely(!pskb_may_pull(skb, len_diff_abs))) 3686 return -ENOMEM; 3687 __skb_pull(skb, len_diff_abs); 3688 } 3689 if (tls_sw_has_ctx_rx(skb->sk)) { 3690 struct strp_msg *rxm = strp_msg(skb); 3691 3692 rxm->full_len += len_diff; 3693 } 3694 return ret; 3695 } 3696 3697 static const struct bpf_func_proto sk_skb_adjust_room_proto = { 3698 .func = sk_skb_adjust_room, 3699 .gpl_only = false, 3700 .ret_type = RET_INTEGER, 3701 .arg1_type = ARG_PTR_TO_CTX, 3702 .arg2_type = ARG_ANYTHING, 3703 .arg3_type = ARG_ANYTHING, 3704 .arg4_type = ARG_ANYTHING, 3705 }; 3706 3707 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 3708 u32, mode, u64, flags) 3709 { 3710 u32 len_cur, len_diff_abs = abs(len_diff); 3711 u32 len_min = bpf_skb_net_base_len(skb); 3712 u32 len_max = BPF_SKB_MAX_LEN; 3713 __be16 proto = skb->protocol; 3714 bool shrink = len_diff < 0; 3715 u32 off; 3716 int ret; 3717 3718 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | 3719 BPF_F_ADJ_ROOM_NO_CSUM_RESET))) 3720 return -EINVAL; 3721 if (unlikely(len_diff_abs > 0xfffU)) 3722 return -EFAULT; 3723 if (unlikely(proto != htons(ETH_P_IP) && 3724 proto != htons(ETH_P_IPV6))) 3725 return -ENOTSUPP; 3726 3727 off = skb_mac_header_len(skb); 3728 switch (mode) { 3729 case BPF_ADJ_ROOM_NET: 3730 off += bpf_skb_net_base_len(skb); 3731 
break; 3732 case BPF_ADJ_ROOM_MAC: 3733 break; 3734 default: 3735 return -ENOTSUPP; 3736 } 3737 3738 if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { 3739 if (!shrink) 3740 return -EINVAL; 3741 3742 switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) { 3743 case BPF_F_ADJ_ROOM_DECAP_L3_IPV4: 3744 len_min = sizeof(struct iphdr); 3745 break; 3746 case BPF_F_ADJ_ROOM_DECAP_L3_IPV6: 3747 len_min = sizeof(struct ipv6hdr); 3748 break; 3749 default: 3750 return -EINVAL; 3751 } 3752 } 3753 3754 len_cur = skb->len - skb_network_offset(skb); 3755 if ((shrink && (len_diff_abs >= len_cur || 3756 len_cur - len_diff_abs < len_min)) || 3757 (!shrink && (skb->len + len_diff_abs > len_max && 3758 !skb_is_gso(skb)))) 3759 return -ENOTSUPP; 3760 3761 ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : 3762 bpf_skb_net_grow(skb, off, len_diff_abs, flags); 3763 if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) 3764 __skb_reset_checksum_unnecessary(skb); 3765 3766 bpf_compute_data_pointers(skb); 3767 return ret; 3768 } 3769 3770 static const struct bpf_func_proto bpf_skb_adjust_room_proto = { 3771 .func = bpf_skb_adjust_room, 3772 .gpl_only = false, 3773 .ret_type = RET_INTEGER, 3774 .arg1_type = ARG_PTR_TO_CTX, 3775 .arg2_type = ARG_ANYTHING, 3776 .arg3_type = ARG_ANYTHING, 3777 .arg4_type = ARG_ANYTHING, 3778 }; 3779 3780 static u32 __bpf_skb_min_len(const struct sk_buff *skb) 3781 { 3782 int offset = skb_network_offset(skb); 3783 u32 min_len = 0; 3784 3785 if (offset > 0) 3786 min_len = offset; 3787 if (skb_transport_header_was_set(skb)) { 3788 offset = skb_transport_offset(skb); 3789 if (offset > 0) 3790 min_len = offset; 3791 } 3792 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3793 offset = skb_checksum_start_offset(skb) + 3794 skb->csum_offset + sizeof(__sum16); 3795 if (offset > 0) 3796 min_len = offset; 3797 } 3798 return min_len; 3799 } 3800 3801 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) 3802 { 3803 unsigned int old_len = skb->len; 3804 int 
ret; 3805 3806 ret = __skb_grow_rcsum(skb, new_len); 3807 if (!ret) 3808 memset(skb->data + old_len, 0, new_len - old_len); 3809 return ret; 3810 } 3811 3812 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) 3813 { 3814 return __skb_trim_rcsum(skb, new_len); 3815 } 3816 3817 static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, 3818 u64 flags) 3819 { 3820 u32 max_len = BPF_SKB_MAX_LEN; 3821 u32 min_len = __bpf_skb_min_len(skb); 3822 int ret; 3823 3824 if (unlikely(flags || new_len > max_len || new_len < min_len)) 3825 return -EINVAL; 3826 if (skb->encapsulation) 3827 return -ENOTSUPP; 3828 3829 /* The basic idea of this helper is that it's performing the 3830 * needed work to either grow or trim an skb, and eBPF program 3831 * rewrites the rest via helpers like bpf_skb_store_bytes(), 3832 * bpf_lX_csum_replace() and others rather than passing a raw 3833 * buffer here. This one is a slow path helper and intended 3834 * for replies with control messages. 3835 * 3836 * Like in bpf_skb_change_proto(), we want to keep this rather 3837 * minimal and without protocol specifics so that we are able 3838 * to separate concerns as in bpf_skb_store_bytes() should only 3839 * be the one responsible for writing buffers. 3840 * 3841 * It's really expected to be a slow path operation here for 3842 * control message replies, so we're implicitly linearizing, 3843 * uncloning and drop offloads from the skb by this. 
3844 */ 3845 ret = __bpf_try_make_writable(skb, skb->len); 3846 if (!ret) { 3847 if (new_len > skb->len) 3848 ret = bpf_skb_grow_rcsum(skb, new_len); 3849 else if (new_len < skb->len) 3850 ret = bpf_skb_trim_rcsum(skb, new_len); 3851 if (!ret && skb_is_gso(skb)) 3852 skb_gso_reset(skb); 3853 } 3854 return ret; 3855 } 3856 3857 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, 3858 u64, flags) 3859 { 3860 int ret = __bpf_skb_change_tail(skb, new_len, flags); 3861 3862 bpf_compute_data_pointers(skb); 3863 return ret; 3864 } 3865 3866 static const struct bpf_func_proto bpf_skb_change_tail_proto = { 3867 .func = bpf_skb_change_tail, 3868 .gpl_only = false, 3869 .ret_type = RET_INTEGER, 3870 .arg1_type = ARG_PTR_TO_CTX, 3871 .arg2_type = ARG_ANYTHING, 3872 .arg3_type = ARG_ANYTHING, 3873 }; 3874 3875 BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, 3876 u64, flags) 3877 { 3878 return __bpf_skb_change_tail(skb, new_len, flags); 3879 } 3880 3881 static const struct bpf_func_proto sk_skb_change_tail_proto = { 3882 .func = sk_skb_change_tail, 3883 .gpl_only = false, 3884 .ret_type = RET_INTEGER, 3885 .arg1_type = ARG_PTR_TO_CTX, 3886 .arg2_type = ARG_ANYTHING, 3887 .arg3_type = ARG_ANYTHING, 3888 }; 3889 3890 static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, 3891 u64 flags) 3892 { 3893 const u8 meta_len = skb_metadata_len(skb); 3894 u32 max_len = BPF_SKB_MAX_LEN; 3895 u32 new_len = skb->len + head_room; 3896 int ret; 3897 3898 if (unlikely(flags || (int)head_room < 0 || 3899 (!skb_is_gso(skb) && new_len > max_len) || 3900 new_len < skb->len)) 3901 return -EINVAL; 3902 3903 ret = skb_cow(skb, meta_len + head_room); 3904 if (likely(!ret)) { 3905 /* Idea for this helper is that we currently only 3906 * allow to expand on mac header. This means that 3907 * skb->protocol network header, etc, stay as is. 
3908 * Compared to bpf_skb_change_tail(), we're more 3909 * flexible due to not needing to linearize or 3910 * reset GSO. Intention for this helper is to be 3911 * used by an L3 skb that needs to push mac header 3912 * for redirection into L2 device. 3913 */ 3914 __skb_push(skb, head_room); 3915 skb_postpush_data_move(skb, head_room, 0); 3916 memset(skb->data, 0, head_room); 3917 skb_reset_mac_header(skb); 3918 skb_reset_mac_len(skb); 3919 } 3920 3921 return ret; 3922 } 3923 3924 BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, 3925 u64, flags) 3926 { 3927 int ret = __bpf_skb_change_head(skb, head_room, flags); 3928 3929 bpf_compute_data_pointers(skb); 3930 return ret; 3931 } 3932 3933 static const struct bpf_func_proto bpf_skb_change_head_proto = { 3934 .func = bpf_skb_change_head, 3935 .gpl_only = false, 3936 .ret_type = RET_INTEGER, 3937 .arg1_type = ARG_PTR_TO_CTX, 3938 .arg2_type = ARG_ANYTHING, 3939 .arg3_type = ARG_ANYTHING, 3940 }; 3941 3942 BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, 3943 u64, flags) 3944 { 3945 return __bpf_skb_change_head(skb, head_room, flags); 3946 } 3947 3948 static const struct bpf_func_proto sk_skb_change_head_proto = { 3949 .func = sk_skb_change_head, 3950 .gpl_only = false, 3951 .ret_type = RET_INTEGER, 3952 .arg1_type = ARG_PTR_TO_CTX, 3953 .arg2_type = ARG_ANYTHING, 3954 .arg3_type = ARG_ANYTHING, 3955 }; 3956 3957 BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp) 3958 { 3959 return xdp_get_buff_len(xdp); 3960 } 3961 3962 static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = { 3963 .func = bpf_xdp_get_buff_len, 3964 .gpl_only = false, 3965 .ret_type = RET_INTEGER, 3966 .arg1_type = ARG_PTR_TO_CTX, 3967 }; 3968 3969 BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff) 3970 3971 const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = { 3972 .func = bpf_xdp_get_buff_len, 3973 .gpl_only = false, 3974 .arg1_type = ARG_PTR_TO_BTF_ID, 3975 .arg1_btf_id 
= &bpf_xdp_get_buff_len_bpf_ids[0], 3976 }; 3977 3978 static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) 3979 { 3980 return xdp_data_meta_unsupported(xdp) ? 0 : 3981 xdp->data - xdp->data_meta; 3982 } 3983 3984 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) 3985 { 3986 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); 3987 unsigned long metalen = xdp_get_metalen(xdp); 3988 void *data_start = xdp_frame_end + metalen; 3989 void *data = xdp->data + offset; 3990 3991 if (unlikely(data < data_start || 3992 data > xdp->data_end - ETH_HLEN)) 3993 return -EINVAL; 3994 3995 if (metalen) 3996 memmove(xdp->data_meta + offset, 3997 xdp->data_meta, metalen); 3998 xdp->data_meta += offset; 3999 xdp->data = data; 4000 4001 return 0; 4002 } 4003 4004 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { 4005 .func = bpf_xdp_adjust_head, 4006 .gpl_only = false, 4007 .ret_type = RET_INTEGER, 4008 .arg1_type = ARG_PTR_TO_CTX, 4009 .arg2_type = ARG_ANYTHING, 4010 }; 4011 4012 void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, 4013 void *buf, unsigned long len, bool flush) 4014 { 4015 unsigned long ptr_len, ptr_off = 0; 4016 skb_frag_t *next_frag, *end_frag; 4017 struct skb_shared_info *sinfo; 4018 void *src, *dst; 4019 u8 *ptr_buf; 4020 4021 if (likely(xdp->data_end - xdp->data >= off + len)) { 4022 src = flush ? buf : xdp->data + off; 4023 dst = flush ? xdp->data + off : buf; 4024 memcpy(dst, src, len); 4025 return; 4026 } 4027 4028 sinfo = xdp_get_shared_info_from_buff(xdp); 4029 end_frag = &sinfo->frags[sinfo->nr_frags]; 4030 next_frag = &sinfo->frags[0]; 4031 4032 ptr_len = xdp->data_end - xdp->data; 4033 ptr_buf = xdp->data; 4034 4035 while (true) { 4036 if (off < ptr_off + ptr_len) { 4037 unsigned long copy_off = off - ptr_off; 4038 unsigned long copy_len = min(len, ptr_len - copy_off); 4039 4040 src = flush ? buf : ptr_buf + copy_off; 4041 dst = flush ? 
ptr_buf + copy_off : buf; 4042 memcpy(dst, src, copy_len); 4043 4044 off += copy_len; 4045 len -= copy_len; 4046 buf += copy_len; 4047 } 4048 4049 if (!len || next_frag == end_frag) 4050 break; 4051 4052 ptr_off += ptr_len; 4053 ptr_buf = skb_frag_address(next_frag); 4054 ptr_len = skb_frag_size(next_frag); 4055 next_frag++; 4056 } 4057 } 4058 4059 void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) 4060 { 4061 u32 size = xdp->data_end - xdp->data; 4062 struct skb_shared_info *sinfo; 4063 void *addr = xdp->data; 4064 int i; 4065 4066 if (unlikely(offset > 0xffff || len > 0xffff)) 4067 return ERR_PTR(-EFAULT); 4068 4069 if (unlikely(offset + len > xdp_get_buff_len(xdp))) 4070 return ERR_PTR(-EINVAL); 4071 4072 if (likely(offset < size)) /* linear area */ 4073 goto out; 4074 4075 sinfo = xdp_get_shared_info_from_buff(xdp); 4076 offset -= size; 4077 for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ 4078 u32 frag_size = skb_frag_size(&sinfo->frags[i]); 4079 4080 if (offset < frag_size) { 4081 addr = skb_frag_address(&sinfo->frags[i]); 4082 size = frag_size; 4083 break; 4084 } 4085 offset -= frag_size; 4086 } 4087 out: 4088 return offset + len <= size ? 
addr + offset : NULL; 4089 } 4090 4091 BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset, 4092 void *, buf, u32, len) 4093 { 4094 void *ptr; 4095 4096 ptr = bpf_xdp_pointer(xdp, offset, len); 4097 if (IS_ERR(ptr)) 4098 return PTR_ERR(ptr); 4099 4100 if (!ptr) 4101 bpf_xdp_copy_buf(xdp, offset, buf, len, false); 4102 else 4103 memcpy(buf, ptr, len); 4104 4105 return 0; 4106 } 4107 4108 static const struct bpf_func_proto bpf_xdp_load_bytes_proto = { 4109 .func = bpf_xdp_load_bytes, 4110 .gpl_only = false, 4111 .ret_type = RET_INTEGER, 4112 .arg1_type = ARG_PTR_TO_CTX, 4113 .arg2_type = ARG_ANYTHING, 4114 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 4115 .arg4_type = ARG_CONST_SIZE, 4116 }; 4117 4118 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) 4119 { 4120 return ____bpf_xdp_load_bytes(xdp, offset, buf, len); 4121 } 4122 4123 BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset, 4124 void *, buf, u32, len) 4125 { 4126 void *ptr; 4127 4128 ptr = bpf_xdp_pointer(xdp, offset, len); 4129 if (IS_ERR(ptr)) 4130 return PTR_ERR(ptr); 4131 4132 if (!ptr) 4133 bpf_xdp_copy_buf(xdp, offset, buf, len, true); 4134 else 4135 memcpy(ptr, buf, len); 4136 4137 return 0; 4138 } 4139 4140 static const struct bpf_func_proto bpf_xdp_store_bytes_proto = { 4141 .func = bpf_xdp_store_bytes, 4142 .gpl_only = false, 4143 .ret_type = RET_INTEGER, 4144 .arg1_type = ARG_PTR_TO_CTX, 4145 .arg2_type = ARG_ANYTHING, 4146 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 4147 .arg4_type = ARG_CONST_SIZE, 4148 }; 4149 4150 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) 4151 { 4152 return ____bpf_xdp_store_bytes(xdp, offset, buf, len); 4153 } 4154 4155 static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) 4156 { 4157 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 4158 skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1]; 4159 struct xdp_rxq_info *rxq = xdp->rxq; 4160 int tailroom; 
4161 4162 if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz) 4163 return -EOPNOTSUPP; 4164 4165 tailroom = rxq->frag_size - skb_frag_size(frag) - 4166 skb_frag_off(frag) % rxq->frag_size; 4167 WARN_ON_ONCE(tailroom < 0); 4168 if (unlikely(offset > tailroom)) 4169 return -EINVAL; 4170 4171 memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); 4172 skb_frag_size_add(frag, offset); 4173 sinfo->xdp_frags_size += offset; 4174 if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) 4175 xsk_buff_get_tail(xdp)->data_end += offset; 4176 4177 return 0; 4178 } 4179 4180 static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, 4181 bool tail, bool release) 4182 { 4183 struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : 4184 xsk_buff_get_head(xdp); 4185 4186 if (release) { 4187 xsk_buff_del_frag(zc_frag); 4188 } else { 4189 if (tail) 4190 zc_frag->data_end -= shrink; 4191 else 4192 zc_frag->data += shrink; 4193 } 4194 4195 return zc_frag; 4196 } 4197 4198 static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, 4199 int shrink, bool tail) 4200 { 4201 enum xdp_mem_type mem_type = xdp->rxq->mem.type; 4202 bool release = skb_frag_size(frag) == shrink; 4203 netmem_ref netmem = skb_frag_netmem(frag); 4204 struct xdp_buff *zc_frag = NULL; 4205 4206 if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { 4207 netmem = 0; 4208 zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); 4209 } 4210 4211 if (release) { 4212 __xdp_return(netmem, mem_type, false, zc_frag); 4213 } else { 4214 if (!tail) 4215 skb_frag_off_add(frag, shrink); 4216 skb_frag_size_sub(frag, shrink); 4217 } 4218 4219 return release; 4220 } 4221 4222 static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) 4223 { 4224 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 4225 int i, n_frags_free = 0, len_free = 0; 4226 4227 if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN)) 4228 return -EINVAL; 4229 4230 for (i = sinfo->nr_frags - 1; i >= 
0 && offset > 0; i--) { 4231 skb_frag_t *frag = &sinfo->frags[i]; 4232 int shrink = min_t(int, offset, skb_frag_size(frag)); 4233 4234 len_free += shrink; 4235 offset -= shrink; 4236 if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) 4237 n_frags_free++; 4238 } 4239 sinfo->nr_frags -= n_frags_free; 4240 sinfo->xdp_frags_size -= len_free; 4241 4242 if (unlikely(!sinfo->nr_frags)) { 4243 xdp_buff_clear_frags_flag(xdp); 4244 xdp_buff_clear_frag_pfmemalloc(xdp); 4245 xdp->data_end -= offset; 4246 } 4247 4248 return 0; 4249 } 4250 4251 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) 4252 { 4253 void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ 4254 void *data_end = xdp->data_end + offset; 4255 4256 if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */ 4257 if (offset < 0) 4258 return bpf_xdp_frags_shrink_tail(xdp, -offset); 4259 4260 return bpf_xdp_frags_increase_tail(xdp, offset); 4261 } 4262 4263 /* Notice that xdp_data_hard_end have reserved some tailroom */ 4264 if (unlikely(data_end > data_hard_end)) 4265 return -EINVAL; 4266 4267 if (unlikely(data_end < xdp->data + ETH_HLEN)) 4268 return -EINVAL; 4269 4270 /* Clear memory area on grow, can contain uninit kernel memory */ 4271 if (offset > 0) 4272 memset(xdp->data_end, 0, offset); 4273 4274 xdp->data_end = data_end; 4275 4276 return 0; 4277 } 4278 4279 static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { 4280 .func = bpf_xdp_adjust_tail, 4281 .gpl_only = false, 4282 .ret_type = RET_INTEGER, 4283 .arg1_type = ARG_PTR_TO_CTX, 4284 .arg2_type = ARG_ANYTHING, 4285 }; 4286 4287 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) 4288 { 4289 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); 4290 void *meta = xdp->data_meta + offset; 4291 unsigned long metalen = xdp->data - meta; 4292 4293 if (xdp_data_meta_unsupported(xdp)) 4294 return -ENOTSUPP; 4295 if (unlikely(meta < xdp_frame_end || 4296 meta > xdp->data)) 4297 
return -EINVAL; 4298 if (unlikely(xdp_metalen_invalid(metalen))) 4299 return -EACCES; 4300 4301 xdp->data_meta = meta; 4302 4303 return 0; 4304 } 4305 4306 static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { 4307 .func = bpf_xdp_adjust_meta, 4308 .gpl_only = false, 4309 .ret_type = RET_INTEGER, 4310 .arg1_type = ARG_PTR_TO_CTX, 4311 .arg2_type = ARG_ANYTHING, 4312 }; 4313 4314 /** 4315 * DOC: xdp redirect 4316 * 4317 * XDP_REDIRECT works by a three-step process, implemented in the functions 4318 * below: 4319 * 4320 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target 4321 * of the redirect and store it (along with some other metadata) in a per-CPU 4322 * struct bpf_redirect_info. 4323 * 4324 * 2. When the program returns the XDP_REDIRECT return code, the driver will 4325 * call xdp_do_redirect() which will use the information in struct 4326 * bpf_redirect_info to actually enqueue the frame into a map type-specific 4327 * bulk queue structure. 4328 * 4329 * 3. Before exiting its NAPI poll loop, the driver will call 4330 * xdp_do_flush(), which will flush all the different bulk queues, 4331 * thus completing the redirect. Note that xdp_do_flush() must be 4332 * called before napi_complete_done() in the driver, as the 4333 * XDP_REDIRECT logic relies on being inside a single NAPI instance 4334 * through to the xdp_do_flush() call for RCU protection of all 4335 * in-kernel data structures. 4336 */ 4337 /* 4338 * Pointers to the map entries will be kept around for this whole sequence of 4339 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in 4340 * the core code; instead, the RCU protection relies on everything happening 4341 * inside a single NAPI poll sequence, which means it's between a pair of calls 4342 * to local_bh_disable()/local_bh_enable(). 
4343 * 4344 * The map entries are marked as __rcu and the map code makes sure to 4345 * dereference those pointers with rcu_dereference_check() in a way that works 4346 * for both sections that to hold an rcu_read_lock() and sections that are 4347 * called from NAPI without a separate rcu_read_lock(). The code below does not 4348 * use RCU annotations, but relies on those in the map code. 4349 */ 4350 void xdp_do_flush(void) 4351 { 4352 struct list_head *lh_map, *lh_dev, *lh_xsk; 4353 4354 bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); 4355 if (lh_dev) 4356 __dev_flush(lh_dev); 4357 if (lh_map) 4358 __cpu_map_flush(lh_map); 4359 if (lh_xsk) 4360 __xsk_map_flush(lh_xsk); 4361 } 4362 EXPORT_SYMBOL_GPL(xdp_do_flush); 4363 4364 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) 4365 void xdp_do_check_flushed(struct napi_struct *napi) 4366 { 4367 struct list_head *lh_map, *lh_dev, *lh_xsk; 4368 bool missed = false; 4369 4370 bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk); 4371 if (lh_dev) { 4372 __dev_flush(lh_dev); 4373 missed = true; 4374 } 4375 if (lh_map) { 4376 __cpu_map_flush(lh_map); 4377 missed = true; 4378 } 4379 if (lh_xsk) { 4380 __xsk_map_flush(lh_xsk); 4381 missed = true; 4382 } 4383 4384 WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n", 4385 napi->poll); 4386 } 4387 #endif 4388 4389 DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); 4390 EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); 4391 4392 u32 xdp_master_redirect(struct xdp_buff *xdp) 4393 { 4394 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4395 struct net_device *master, *slave; 4396 4397 master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); 4398 slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); 4399 if (slave && slave != xdp->rxq->dev) { 4400 /* The target device is different from the receiving device, so 4401 * redirect it to the new device. 
4402 * Using XDP_REDIRECT gets the correct behaviour from XDP enabled 4403 * drivers to unmap the packet from their rx ring. 4404 */ 4405 ri->tgt_index = slave->ifindex; 4406 ri->map_id = INT_MAX; 4407 ri->map_type = BPF_MAP_TYPE_UNSPEC; 4408 return XDP_REDIRECT; 4409 } 4410 return XDP_TX; 4411 } 4412 EXPORT_SYMBOL_GPL(xdp_master_redirect); 4413 4414 static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, 4415 const struct net_device *dev, 4416 struct xdp_buff *xdp, 4417 const struct bpf_prog *xdp_prog) 4418 { 4419 enum bpf_map_type map_type = ri->map_type; 4420 void *fwd = ri->tgt_value; 4421 u32 map_id = ri->map_id; 4422 int err; 4423 4424 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ 4425 ri->map_type = BPF_MAP_TYPE_UNSPEC; 4426 4427 err = __xsk_map_redirect(fwd, xdp); 4428 if (unlikely(err)) 4429 goto err; 4430 4431 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); 4432 return 0; 4433 err: 4434 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); 4435 return err; 4436 } 4437 4438 static __always_inline int 4439 __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, 4440 struct xdp_frame *xdpf, 4441 const struct bpf_prog *xdp_prog) 4442 { 4443 enum bpf_map_type map_type = ri->map_type; 4444 void *fwd = ri->tgt_value; 4445 u32 map_id = ri->map_id; 4446 u32 flags = ri->flags; 4447 struct bpf_map *map; 4448 int err; 4449 4450 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ 4451 ri->flags = 0; 4452 ri->map_type = BPF_MAP_TYPE_UNSPEC; 4453 4454 if (unlikely(!xdpf)) { 4455 err = -EOVERFLOW; 4456 goto err; 4457 } 4458 4459 switch (map_type) { 4460 case BPF_MAP_TYPE_DEVMAP: 4461 fallthrough; 4462 case BPF_MAP_TYPE_DEVMAP_HASH: 4463 if (unlikely(flags & BPF_F_BROADCAST)) { 4464 map = READ_ONCE(ri->map); 4465 4466 /* The map pointer is cleared when the map is being torn 4467 * down by dev_map_free() 4468 */ 4469 if (unlikely(!map)) { 4470 err = -ENOENT; 
4471 break; 4472 } 4473 4474 WRITE_ONCE(ri->map, NULL); 4475 err = dev_map_enqueue_multi(xdpf, dev, map, 4476 flags & BPF_F_EXCLUDE_INGRESS); 4477 } else { 4478 err = dev_map_enqueue(fwd, xdpf, dev); 4479 } 4480 break; 4481 case BPF_MAP_TYPE_CPUMAP: 4482 err = cpu_map_enqueue(fwd, xdpf, dev); 4483 break; 4484 case BPF_MAP_TYPE_UNSPEC: 4485 if (map_id == INT_MAX) { 4486 fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); 4487 if (unlikely(!fwd)) { 4488 err = -EINVAL; 4489 break; 4490 } 4491 err = dev_xdp_enqueue(fwd, xdpf, dev); 4492 break; 4493 } 4494 fallthrough; 4495 default: 4496 err = -EBADRQC; 4497 } 4498 4499 if (unlikely(err)) 4500 goto err; 4501 4502 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); 4503 return 0; 4504 err: 4505 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); 4506 return err; 4507 } 4508 4509 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, 4510 const struct bpf_prog *xdp_prog) 4511 { 4512 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4513 enum bpf_map_type map_type = ri->map_type; 4514 4515 if (map_type == BPF_MAP_TYPE_XSKMAP) 4516 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); 4517 4518 return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), 4519 xdp_prog); 4520 } 4521 EXPORT_SYMBOL_GPL(xdp_do_redirect); 4522 4523 int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, 4524 struct xdp_frame *xdpf, 4525 const struct bpf_prog *xdp_prog) 4526 { 4527 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4528 enum bpf_map_type map_type = ri->map_type; 4529 4530 if (map_type == BPF_MAP_TYPE_XSKMAP) 4531 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); 4532 4533 return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog); 4534 } 4535 EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); 4536 4537 static int xdp_do_generic_redirect_map(struct net_device *dev, 4538 struct sk_buff *skb, 4539 struct xdp_buff *xdp, 4540 const 
struct bpf_prog *xdp_prog, 4541 void *fwd, enum bpf_map_type map_type, 4542 u32 map_id, u32 flags) 4543 { 4544 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4545 struct bpf_map *map; 4546 int err; 4547 4548 switch (map_type) { 4549 case BPF_MAP_TYPE_DEVMAP: 4550 fallthrough; 4551 case BPF_MAP_TYPE_DEVMAP_HASH: 4552 if (unlikely(flags & BPF_F_BROADCAST)) { 4553 map = READ_ONCE(ri->map); 4554 4555 /* The map pointer is cleared when the map is being torn 4556 * down by dev_map_free() 4557 */ 4558 if (unlikely(!map)) { 4559 err = -ENOENT; 4560 break; 4561 } 4562 4563 WRITE_ONCE(ri->map, NULL); 4564 err = dev_map_redirect_multi(dev, skb, xdp_prog, map, 4565 flags & BPF_F_EXCLUDE_INGRESS); 4566 } else { 4567 err = dev_map_generic_redirect(fwd, skb, xdp_prog); 4568 } 4569 if (unlikely(err)) 4570 goto err; 4571 break; 4572 case BPF_MAP_TYPE_XSKMAP: 4573 err = xsk_generic_rcv(fwd, xdp); 4574 if (err) 4575 goto err; 4576 consume_skb(skb); 4577 break; 4578 case BPF_MAP_TYPE_CPUMAP: 4579 err = cpu_map_generic_redirect(fwd, skb); 4580 if (unlikely(err)) 4581 goto err; 4582 break; 4583 default: 4584 err = -EBADRQC; 4585 goto err; 4586 } 4587 4588 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); 4589 return 0; 4590 err: 4591 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); 4592 return err; 4593 } 4594 4595 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 4596 struct xdp_buff *xdp, 4597 const struct bpf_prog *xdp_prog) 4598 { 4599 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4600 enum bpf_map_type map_type = ri->map_type; 4601 void *fwd = ri->tgt_value; 4602 u32 map_id = ri->map_id; 4603 u32 flags = ri->flags; 4604 int err; 4605 4606 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ 4607 ri->flags = 0; 4608 ri->map_type = BPF_MAP_TYPE_UNSPEC; 4609 4610 if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { 4611 fwd = dev_get_by_index_rcu(dev_net(dev), 
ri->tgt_index); 4612 if (unlikely(!fwd)) { 4613 err = -EINVAL; 4614 goto err; 4615 } 4616 4617 err = xdp_ok_fwd_dev(fwd, skb->len); 4618 if (unlikely(err)) 4619 goto err; 4620 4621 skb->dev = fwd; 4622 _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); 4623 generic_xdp_tx(skb, xdp_prog); 4624 return 0; 4625 } 4626 4627 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags); 4628 err: 4629 _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); 4630 return err; 4631 } 4632 4633 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) 4634 { 4635 struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); 4636 4637 if (unlikely(flags)) 4638 return XDP_ABORTED; 4639 4640 /* NB! Map type UNSPEC and map_id == INT_MAX (never generated 4641 * by map_idr) is used for ifindex based XDP redirect. 4642 */ 4643 ri->tgt_index = ifindex; 4644 ri->map_id = INT_MAX; 4645 ri->map_type = BPF_MAP_TYPE_UNSPEC; 4646 4647 return XDP_REDIRECT; 4648 } 4649 4650 static const struct bpf_func_proto bpf_xdp_redirect_proto = { 4651 .func = bpf_xdp_redirect, 4652 .gpl_only = false, 4653 .ret_type = RET_INTEGER, 4654 .arg1_type = ARG_ANYTHING, 4655 .arg2_type = ARG_ANYTHING, 4656 }; 4657 4658 BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, 4659 u64, flags) 4660 { 4661 return map->ops->map_redirect(map, key, flags); 4662 } 4663 4664 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { 4665 .func = bpf_xdp_redirect_map, 4666 .gpl_only = false, 4667 .ret_type = RET_INTEGER, 4668 .arg1_type = ARG_CONST_MAP_PTR, 4669 .arg2_type = ARG_ANYTHING, 4670 .arg3_type = ARG_ANYTHING, 4671 }; 4672 4673 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 4674 unsigned long off, unsigned long len) 4675 { 4676 void *ptr = skb_header_pointer(skb, off, len, dst_buff); 4677 4678 if (unlikely(!ptr)) 4679 return len; 4680 if (ptr != dst_buff) 4681 memcpy(dst_buff, ptr, len); 4682 4683 return 0; 4684 } 4685 4686 BPF_CALL_5(bpf_skb_event_output, 
struct sk_buff *, skb, struct bpf_map *, map, 4687 u64, flags, void *, meta, u64, meta_size) 4688 { 4689 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 4690 4691 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 4692 return -EINVAL; 4693 if (unlikely(!skb || skb_size > skb->len)) 4694 return -EFAULT; 4695 4696 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, 4697 bpf_skb_copy); 4698 } 4699 4700 static const struct bpf_func_proto bpf_skb_event_output_proto = { 4701 .func = bpf_skb_event_output, 4702 .gpl_only = true, 4703 .ret_type = RET_INTEGER, 4704 .arg1_type = ARG_PTR_TO_CTX, 4705 .arg2_type = ARG_CONST_MAP_PTR, 4706 .arg3_type = ARG_ANYTHING, 4707 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 4708 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 4709 }; 4710 4711 BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff) 4712 4713 const struct bpf_func_proto bpf_skb_output_proto = { 4714 .func = bpf_skb_event_output, 4715 .gpl_only = true, 4716 .ret_type = RET_INTEGER, 4717 .arg1_type = ARG_PTR_TO_BTF_ID, 4718 .arg1_btf_id = &bpf_skb_output_btf_ids[0], 4719 .arg2_type = ARG_CONST_MAP_PTR, 4720 .arg3_type = ARG_ANYTHING, 4721 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 4722 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 4723 }; 4724 4725 static unsigned short bpf_tunnel_key_af(u64 flags) 4726 { 4727 return flags & BPF_F_TUNINFO_IPV6 ? 
/*
 * BPF helper: copy the tunnel metadata attached to @skb into @to.
 *
 * @size selects the layout the caller was built against: either the full
 * struct bpf_tunnel_key or one of the shorter, older layouts enumerated in
 * the switch below. For a shorter layout the key is assembled in the
 * on-stack @compat buffer and only the first @size bytes are copied out.
 *
 * Returns 0 on success, -EINVAL when there is no tunnel info, @flags has
 * unknown bits or @size is not a recognised layout, and -EPROTO when the
 * address family of the tunnel info does not match the one selected by
 * @flags. On every error path @size bytes of the caller's buffer are zeroed.
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
	   u32, size, u64, flags)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	/* remember the caller's buffer; @to may be redirected to @compat */
	void *to_orig = to;
	int err;

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
					 BPF_F_TUNINFO_FLAGS)))) {
		err = -EINVAL;
		goto err_clear;
	}
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
		err = -EPROTO;
		goto err_clear;
	}
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		/* default error for every rejection inside the switch */
		err = -EINVAL;
		switch (size) {
		case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			/* this layout is only accepted for AF_INET tunnel info */
			if (ip_tunnel_info_af(info) != AF_INET)
				goto err_clear;
set_compat:
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			goto err_clear;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;
	/* tunnel flags are only reported on request; otherwise the field is zeroed */
	if (flags & BPF_F_TUNINFO_FLAGS)
		to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
	else
		to->tunnel_ext = 0;

	/* remote_* is filled from the key's src, local_* from the key's dst */
	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
		       sizeof(to->local_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
		/* clear the three trailing words of each address array */
		memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
		to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
		memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
		to->tunnel_label = 0;
	}

	/* shorter layout: hand back only the part the caller asked for */
	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy(to_orig, to, size);

	return 0;
err_clear:
	memset(to_orig, 0, size);
	return err;
}
4826 memset(to + info->options_len, 0, size - info->options_len); 4827 4828 return info->options_len; 4829 err_clear: 4830 memset(to, 0, size); 4831 return err; 4832 } 4833 4834 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { 4835 .func = bpf_skb_get_tunnel_opt, 4836 .gpl_only = false, 4837 .ret_type = RET_INTEGER, 4838 .arg1_type = ARG_PTR_TO_CTX, 4839 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 4840 .arg3_type = ARG_CONST_SIZE, 4841 }; 4842 4843 static struct metadata_dst __percpu *md_dst; 4844 4845 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, 4846 const struct bpf_tunnel_key *, from, u32, size, u64, flags) 4847 { 4848 struct metadata_dst *md = this_cpu_ptr(md_dst); 4849 u8 compat[sizeof(struct bpf_tunnel_key)]; 4850 struct ip_tunnel_info *info; 4851 4852 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | 4853 BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER | 4854 BPF_F_NO_TUNNEL_KEY))) 4855 return -EINVAL; 4856 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 4857 switch (size) { 4858 case offsetof(struct bpf_tunnel_key, local_ipv6[0]): 4859 case offsetof(struct bpf_tunnel_key, tunnel_label): 4860 case offsetof(struct bpf_tunnel_key, tunnel_ext): 4861 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 4862 /* Fixup deprecated structure layouts here, so we have 4863 * a common path later on. 
4864 */ 4865 memcpy(compat, from, size); 4866 memset(compat + size, 0, sizeof(compat) - size); 4867 from = (const struct bpf_tunnel_key *) compat; 4868 break; 4869 default: 4870 return -EINVAL; 4871 } 4872 } 4873 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || 4874 from->tunnel_ext)) 4875 return -EINVAL; 4876 4877 skb_dst_drop(skb); 4878 dst_hold((struct dst_entry *) md); 4879 skb_dst_set(skb, (struct dst_entry *) md); 4880 4881 info = &md->u.tun_info; 4882 memset(info, 0, sizeof(*info)); 4883 info->mode = IP_TUNNEL_INFO_TX; 4884 4885 __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags); 4886 __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags, 4887 flags & BPF_F_DONT_FRAGMENT); 4888 __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags, 4889 !(flags & BPF_F_ZERO_CSUM_TX)); 4890 __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags, 4891 flags & BPF_F_SEQ_NUMBER); 4892 __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags, 4893 !(flags & BPF_F_NO_TUNNEL_KEY)); 4894 4895 info->key.tun_id = cpu_to_be64(from->tunnel_id); 4896 info->key.tos = from->tunnel_tos; 4897 info->key.ttl = from->tunnel_ttl; 4898 4899 if (flags & BPF_F_TUNINFO_IPV6) { 4900 info->mode |= IP_TUNNEL_INFO_IPV6; 4901 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, 4902 sizeof(from->remote_ipv6)); 4903 memcpy(&info->key.u.ipv6.src, from->local_ipv6, 4904 sizeof(from->local_ipv6)); 4905 info->key.label = cpu_to_be32(from->tunnel_label) & 4906 IPV6_FLOWLABEL_MASK; 4907 } else { 4908 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); 4909 info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4); 4910 info->key.flow_flags = FLOWI_FLAG_ANYSRC; 4911 } 4912 4913 return 0; 4914 } 4915 4916 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { 4917 .func = bpf_skb_set_tunnel_key, 4918 .gpl_only = false, 4919 .ret_type = RET_INTEGER, 4920 .arg1_type = ARG_PTR_TO_CTX, 4921 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 4922 .arg3_type = ARG_CONST_SIZE, 4923 .arg4_type = 
ARG_ANYTHING, 4924 }; 4925 4926 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, 4927 const u8 *, from, u32, size) 4928 { 4929 struct ip_tunnel_info *info = skb_tunnel_info(skb); 4930 const struct metadata_dst *md = this_cpu_ptr(md_dst); 4931 IP_TUNNEL_DECLARE_FLAGS(present) = { }; 4932 4933 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) 4934 return -EINVAL; 4935 if (unlikely(size > IP_TUNNEL_OPTS_MAX)) 4936 return -ENOMEM; 4937 4938 ip_tunnel_set_options_present(present); 4939 ip_tunnel_info_opts_set(info, from, size, present); 4940 4941 return 0; 4942 } 4943 4944 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { 4945 .func = bpf_skb_set_tunnel_opt, 4946 .gpl_only = false, 4947 .ret_type = RET_INTEGER, 4948 .arg1_type = ARG_PTR_TO_CTX, 4949 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 4950 .arg3_type = ARG_CONST_SIZE, 4951 }; 4952 4953 static const struct bpf_func_proto * 4954 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) 4955 { 4956 if (!md_dst) { 4957 struct metadata_dst __percpu *tmp; 4958 4959 tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, 4960 METADATA_IP_TUNNEL, 4961 GFP_KERNEL); 4962 if (!tmp) 4963 return NULL; 4964 if (cmpxchg(&md_dst, NULL, tmp)) 4965 metadata_dst_free_percpu(tmp); 4966 } 4967 4968 switch (which) { 4969 case BPF_FUNC_skb_set_tunnel_key: 4970 return &bpf_skb_set_tunnel_key_proto; 4971 case BPF_FUNC_skb_set_tunnel_opt: 4972 return &bpf_skb_set_tunnel_opt_proto; 4973 default: 4974 return NULL; 4975 } 4976 } 4977 4978 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, 4979 u32, idx) 4980 { 4981 struct bpf_array *array = container_of(map, struct bpf_array, map); 4982 struct cgroup *cgrp; 4983 struct sock *sk; 4984 4985 sk = skb_to_full_sk(skb); 4986 if (!sk || !sk_fullsock(sk)) 4987 return -ENOENT; 4988 if (unlikely(idx >= array->map.max_entries)) 4989 return -E2BIG; 4990 4991 cgrp = READ_ONCE(array->ptrs[idx]); 4992 if (unlikely(!cgrp)) 4993 return -EAGAIN; 
4994 4995 return sk_under_cgroup_hierarchy(sk, cgrp); 4996 } 4997 4998 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 4999 .func = bpf_skb_under_cgroup, 5000 .gpl_only = false, 5001 .ret_type = RET_INTEGER, 5002 .arg1_type = ARG_PTR_TO_CTX, 5003 .arg2_type = ARG_CONST_MAP_PTR, 5004 .arg3_type = ARG_ANYTHING, 5005 }; 5006 5007 #ifdef CONFIG_SOCK_CGROUP_DATA 5008 static inline u64 __bpf_sk_cgroup_id(struct sock *sk) 5009 { 5010 struct cgroup *cgrp; 5011 5012 sk = sk_to_full_sk(sk); 5013 if (!sk || !sk_fullsock(sk)) 5014 return 0; 5015 5016 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 5017 return cgroup_id(cgrp); 5018 } 5019 5020 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) 5021 { 5022 return __bpf_sk_cgroup_id(skb->sk); 5023 } 5024 5025 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { 5026 .func = bpf_skb_cgroup_id, 5027 .gpl_only = false, 5028 .ret_type = RET_INTEGER, 5029 .arg1_type = ARG_PTR_TO_CTX, 5030 }; 5031 5032 static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, 5033 int ancestor_level) 5034 { 5035 struct cgroup *ancestor; 5036 struct cgroup *cgrp; 5037 5038 sk = sk_to_full_sk(sk); 5039 if (!sk || !sk_fullsock(sk)) 5040 return 0; 5041 5042 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 5043 ancestor = cgroup_ancestor(cgrp, ancestor_level); 5044 if (!ancestor) 5045 return 0; 5046 5047 return cgroup_id(ancestor); 5048 } 5049 5050 BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, 5051 ancestor_level) 5052 { 5053 return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); 5054 } 5055 5056 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { 5057 .func = bpf_skb_ancestor_cgroup_id, 5058 .gpl_only = false, 5059 .ret_type = RET_INTEGER, 5060 .arg1_type = ARG_PTR_TO_CTX, 5061 .arg2_type = ARG_ANYTHING, 5062 }; 5063 5064 BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) 5065 { 5066 return __bpf_sk_cgroup_id(sk); 5067 } 5068 5069 static const struct bpf_func_proto 
bpf_sk_cgroup_id_proto = { 5070 .func = bpf_sk_cgroup_id, 5071 .gpl_only = false, 5072 .ret_type = RET_INTEGER, 5073 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 5074 }; 5075 5076 BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) 5077 { 5078 return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); 5079 } 5080 5081 static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { 5082 .func = bpf_sk_ancestor_cgroup_id, 5083 .gpl_only = false, 5084 .ret_type = RET_INTEGER, 5085 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 5086 .arg2_type = ARG_ANYTHING, 5087 }; 5088 #endif 5089 5090 static unsigned long bpf_xdp_copy(void *dst, const void *ctx, 5091 unsigned long off, unsigned long len) 5092 { 5093 struct xdp_buff *xdp = (struct xdp_buff *)ctx; 5094 5095 bpf_xdp_copy_buf(xdp, off, dst, len, false); 5096 return 0; 5097 } 5098 5099 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, 5100 u64, flags, void *, meta, u64, meta_size) 5101 { 5102 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 5103 5104 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 5105 return -EINVAL; 5106 5107 if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp))) 5108 return -EFAULT; 5109 5110 return bpf_event_output(map, flags, meta, meta_size, xdp, 5111 xdp_size, bpf_xdp_copy); 5112 } 5113 5114 static const struct bpf_func_proto bpf_xdp_event_output_proto = { 5115 .func = bpf_xdp_event_output, 5116 .gpl_only = true, 5117 .ret_type = RET_INTEGER, 5118 .arg1_type = ARG_PTR_TO_CTX, 5119 .arg2_type = ARG_CONST_MAP_PTR, 5120 .arg3_type = ARG_ANYTHING, 5121 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 5122 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 5123 }; 5124 5125 BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff) 5126 5127 const struct bpf_func_proto bpf_xdp_output_proto = { 5128 .func = bpf_xdp_event_output, 5129 .gpl_only = true, 5130 .ret_type = RET_INTEGER, 5131 .arg1_type = ARG_PTR_TO_BTF_ID, 5132 .arg1_btf_id = 
&bpf_xdp_output_btf_ids[0], 5133 .arg2_type = ARG_CONST_MAP_PTR, 5134 .arg3_type = ARG_ANYTHING, 5135 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 5136 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 5137 }; 5138 5139 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) 5140 { 5141 return skb->sk ? __sock_gen_cookie(skb->sk) : 0; 5142 } 5143 5144 static const struct bpf_func_proto bpf_get_socket_cookie_proto = { 5145 .func = bpf_get_socket_cookie, 5146 .gpl_only = false, 5147 .ret_type = RET_INTEGER, 5148 .arg1_type = ARG_PTR_TO_CTX, 5149 }; 5150 5151 BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) 5152 { 5153 return __sock_gen_cookie(ctx->sk); 5154 } 5155 5156 static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { 5157 .func = bpf_get_socket_cookie_sock_addr, 5158 .gpl_only = false, 5159 .ret_type = RET_INTEGER, 5160 .arg1_type = ARG_PTR_TO_CTX, 5161 }; 5162 5163 BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx) 5164 { 5165 return __sock_gen_cookie(ctx); 5166 } 5167 5168 static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { 5169 .func = bpf_get_socket_cookie_sock, 5170 .gpl_only = false, 5171 .ret_type = RET_INTEGER, 5172 .arg1_type = ARG_PTR_TO_CTX, 5173 }; 5174 5175 BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) 5176 { 5177 return sk ? 
sock_gen_cookie(sk) : 0; 5178 } 5179 5180 const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { 5181 .func = bpf_get_socket_ptr_cookie, 5182 .gpl_only = false, 5183 .ret_type = RET_INTEGER, 5184 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL, 5185 }; 5186 5187 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) 5188 { 5189 return __sock_gen_cookie(ctx->sk); 5190 } 5191 5192 static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { 5193 .func = bpf_get_socket_cookie_sock_ops, 5194 .gpl_only = false, 5195 .ret_type = RET_INTEGER, 5196 .arg1_type = ARG_PTR_TO_CTX, 5197 }; 5198 5199 static u64 __bpf_get_netns_cookie(struct sock *sk) 5200 { 5201 const struct net *net = sk ? sock_net(sk) : &init_net; 5202 5203 return net->net_cookie; 5204 } 5205 5206 BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) 5207 { 5208 return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); 5209 } 5210 5211 static const struct bpf_func_proto bpf_get_netns_cookie_proto = { 5212 .func = bpf_get_netns_cookie, 5213 .ret_type = RET_INTEGER, 5214 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5215 }; 5216 5217 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) 5218 { 5219 return __bpf_get_netns_cookie(ctx); 5220 } 5221 5222 static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = { 5223 .func = bpf_get_netns_cookie_sock, 5224 .gpl_only = false, 5225 .ret_type = RET_INTEGER, 5226 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5227 }; 5228 5229 BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) 5230 { 5231 return __bpf_get_netns_cookie(ctx ? 
ctx->sk : NULL); 5232 } 5233 5234 static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { 5235 .func = bpf_get_netns_cookie_sock_addr, 5236 .gpl_only = false, 5237 .ret_type = RET_INTEGER, 5238 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5239 }; 5240 5241 BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) 5242 { 5243 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); 5244 } 5245 5246 static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { 5247 .func = bpf_get_netns_cookie_sock_ops, 5248 .gpl_only = false, 5249 .ret_type = RET_INTEGER, 5250 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5251 }; 5252 5253 BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) 5254 { 5255 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); 5256 } 5257 5258 static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { 5259 .func = bpf_get_netns_cookie_sk_msg, 5260 .gpl_only = false, 5261 .ret_type = RET_INTEGER, 5262 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5263 }; 5264 5265 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) 5266 { 5267 struct sock *sk = sk_to_full_sk(skb->sk); 5268 kuid_t kuid; 5269 5270 if (!sk || !sk_fullsock(sk)) 5271 return overflowuid; 5272 kuid = sock_net_uid(sock_net(sk), sk); 5273 return from_kuid_munged(sock_net(sk)->user_ns, kuid); 5274 } 5275 5276 static const struct bpf_func_proto bpf_get_socket_uid_proto = { 5277 .func = bpf_get_socket_uid, 5278 .gpl_only = false, 5279 .ret_type = RET_INTEGER, 5280 .arg1_type = ARG_PTR_TO_CTX, 5281 }; 5282 5283 static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) 5284 { 5285 u32 sk_bpf_cb_flags; 5286 5287 if (getopt) { 5288 *(u32 *)optval = sk->sk_bpf_cb_flags; 5289 return 0; 5290 } 5291 5292 sk_bpf_cb_flags = *(u32 *)optval; 5293 5294 if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) 5295 return -EINVAL; 5296 5297 sk->sk_bpf_cb_flags = sk_bpf_cb_flags; 5298 5299 return 0; 5300 } 5301 5302 static int sol_socket_sockopt(struct sock *sk, int 
optname, 5303 char *optval, int *optlen, 5304 bool getopt) 5305 { 5306 switch (optname) { 5307 case SO_REUSEADDR: 5308 case SO_SNDBUF: 5309 case SO_RCVBUF: 5310 case SO_KEEPALIVE: 5311 case SO_PRIORITY: 5312 case SO_REUSEPORT: 5313 case SO_RCVLOWAT: 5314 case SO_MARK: 5315 case SO_MAX_PACING_RATE: 5316 case SO_BINDTOIFINDEX: 5317 case SO_TXREHASH: 5318 case SK_BPF_CB_FLAGS: 5319 if (*optlen != sizeof(int)) 5320 return -EINVAL; 5321 break; 5322 case SO_BINDTODEVICE: 5323 break; 5324 default: 5325 return -EINVAL; 5326 } 5327 5328 if (optname == SK_BPF_CB_FLAGS) 5329 return sk_bpf_set_get_cb_flags(sk, optval, getopt); 5330 5331 if (getopt) { 5332 if (optname == SO_BINDTODEVICE) 5333 return -EINVAL; 5334 return sk_getsockopt(sk, SOL_SOCKET, optname, 5335 KERNEL_SOCKPTR(optval), 5336 KERNEL_SOCKPTR(optlen)); 5337 } 5338 5339 return sk_setsockopt(sk, SOL_SOCKET, optname, 5340 KERNEL_SOCKPTR(optval), *optlen); 5341 } 5342 5343 static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, 5344 char *optval, int optlen) 5345 { 5346 if (optlen != sizeof(int)) 5347 return -EINVAL; 5348 5349 switch (optname) { 5350 case TCP_BPF_SOCK_OPS_CB_FLAGS: { 5351 int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; 5352 5353 memcpy(optval, &cb_flags, optlen); 5354 break; 5355 } 5356 case TCP_BPF_RTO_MIN: { 5357 int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); 5358 5359 memcpy(optval, &rto_min_us, optlen); 5360 break; 5361 } 5362 case TCP_BPF_DELACK_MAX: { 5363 int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); 5364 5365 memcpy(optval, &delack_max_us, optlen); 5366 break; 5367 } 5368 default: 5369 return -EINVAL; 5370 } 5371 5372 return 0; 5373 } 5374 5375 static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, 5376 char *optval, int optlen) 5377 { 5378 struct tcp_sock *tp = tcp_sk(sk); 5379 unsigned long timeout; 5380 int val; 5381 5382 if (optlen != sizeof(int)) 5383 return -EINVAL; 5384 5385 val = *(int *)optval; 5386 5387 /* Only some options 
are supported */ 5388 switch (optname) { 5389 case TCP_BPF_IW: 5390 if (val <= 0 || tp->data_segs_out > tp->syn_data) 5391 return -EINVAL; 5392 tcp_snd_cwnd_set(tp, val); 5393 break; 5394 case TCP_BPF_SNDCWND_CLAMP: 5395 if (val <= 0) 5396 return -EINVAL; 5397 tp->snd_cwnd_clamp = val; 5398 tp->snd_ssthresh = val; 5399 break; 5400 case TCP_BPF_DELACK_MAX: 5401 timeout = usecs_to_jiffies(val); 5402 if (timeout > TCP_DELACK_MAX || 5403 timeout < TCP_TIMEOUT_MIN) 5404 return -EINVAL; 5405 inet_csk(sk)->icsk_delack_max = timeout; 5406 break; 5407 case TCP_BPF_RTO_MIN: 5408 timeout = usecs_to_jiffies(val); 5409 if (timeout > TCP_RTO_MIN || 5410 timeout < TCP_TIMEOUT_MIN) 5411 return -EINVAL; 5412 inet_csk(sk)->icsk_rto_min = timeout; 5413 break; 5414 case TCP_BPF_SOCK_OPS_CB_FLAGS: 5415 if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) 5416 return -EINVAL; 5417 tp->bpf_sock_ops_cb_flags = val; 5418 break; 5419 default: 5420 return -EINVAL; 5421 } 5422 5423 return 0; 5424 } 5425 5426 static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, 5427 int *optlen, bool getopt) 5428 { 5429 struct tcp_sock *tp; 5430 int ret; 5431 5432 if (*optlen < 2) 5433 return -EINVAL; 5434 5435 if (getopt) { 5436 if (!inet_csk(sk)->icsk_ca_ops) 5437 return -EINVAL; 5438 /* BPF expects NULL-terminated tcp-cc string */ 5439 optval[--(*optlen)] = '\0'; 5440 return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, 5441 KERNEL_SOCKPTR(optval), 5442 KERNEL_SOCKPTR(optlen)); 5443 } 5444 5445 /* "cdg" is the only cc that alloc a ptr 5446 * in inet_csk_ca area. The bpf-tcp-cc may 5447 * overwrite this ptr after switching to cdg. 5448 */ 5449 if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) 5450 return -ENOTSUPP; 5451 5452 /* It stops this looping 5453 * 5454 * .init => bpf_setsockopt(tcp_cc) => .init => 5455 * bpf_setsockopt(tcp_cc)" => .init => .... 
5456 * 5457 * The second bpf_setsockopt(tcp_cc) is not allowed 5458 * in order to break the loop when both .init 5459 * are the same bpf prog. 5460 * 5461 * This applies even the second bpf_setsockopt(tcp_cc) 5462 * does not cause a loop. This limits only the first 5463 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to 5464 * pick a fallback cc (eg. peer does not support ECN) 5465 * and the second '.init' cannot fallback to 5466 * another. 5467 */ 5468 tp = tcp_sk(sk); 5469 if (tp->bpf_chg_cc_inprogress) 5470 return -EBUSY; 5471 5472 tp->bpf_chg_cc_inprogress = 1; 5473 ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, 5474 KERNEL_SOCKPTR(optval), *optlen); 5475 tp->bpf_chg_cc_inprogress = 0; 5476 return ret; 5477 } 5478 5479 static int sol_tcp_sockopt(struct sock *sk, int optname, 5480 char *optval, int *optlen, 5481 bool getopt) 5482 { 5483 if (sk->sk_protocol != IPPROTO_TCP) 5484 return -EINVAL; 5485 5486 switch (optname) { 5487 case TCP_NODELAY: 5488 case TCP_MAXSEG: 5489 case TCP_KEEPIDLE: 5490 case TCP_KEEPINTVL: 5491 case TCP_KEEPCNT: 5492 case TCP_SYNCNT: 5493 case TCP_WINDOW_CLAMP: 5494 case TCP_THIN_LINEAR_TIMEOUTS: 5495 case TCP_USER_TIMEOUT: 5496 case TCP_NOTSENT_LOWAT: 5497 case TCP_SAVE_SYN: 5498 case TCP_RTO_MAX_MS: 5499 if (*optlen != sizeof(int)) 5500 return -EINVAL; 5501 break; 5502 case TCP_CONGESTION: 5503 return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); 5504 case TCP_SAVED_SYN: 5505 if (*optlen < 1) 5506 return -EINVAL; 5507 break; 5508 default: 5509 if (getopt) 5510 return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); 5511 return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); 5512 } 5513 5514 if (getopt) { 5515 if (optname == TCP_SAVED_SYN) { 5516 struct tcp_sock *tp = tcp_sk(sk); 5517 5518 if (!tp->saved_syn || 5519 *optlen > tcp_saved_syn_len(tp->saved_syn)) 5520 return -EINVAL; 5521 memcpy(optval, tp->saved_syn->data, *optlen); 5522 /* It cannot free tp->saved_syn here because it 5523 * does not know if the 
user space still needs it. 5524 */ 5525 return 0; 5526 } 5527 5528 return do_tcp_getsockopt(sk, SOL_TCP, optname, 5529 KERNEL_SOCKPTR(optval), 5530 KERNEL_SOCKPTR(optlen)); 5531 } 5532 5533 return do_tcp_setsockopt(sk, SOL_TCP, optname, 5534 KERNEL_SOCKPTR(optval), *optlen); 5535 } 5536 5537 static int sol_ip_sockopt(struct sock *sk, int optname, 5538 char *optval, int *optlen, 5539 bool getopt) 5540 { 5541 if (sk->sk_family != AF_INET) 5542 return -EINVAL; 5543 5544 switch (optname) { 5545 case IP_TOS: 5546 if (*optlen != sizeof(int)) 5547 return -EINVAL; 5548 break; 5549 default: 5550 return -EINVAL; 5551 } 5552 5553 if (getopt) 5554 return do_ip_getsockopt(sk, SOL_IP, optname, 5555 KERNEL_SOCKPTR(optval), 5556 KERNEL_SOCKPTR(optlen)); 5557 5558 return do_ip_setsockopt(sk, SOL_IP, optname, 5559 KERNEL_SOCKPTR(optval), *optlen); 5560 } 5561 5562 static int sol_ipv6_sockopt(struct sock *sk, int optname, 5563 char *optval, int *optlen, 5564 bool getopt) 5565 { 5566 if (sk->sk_family != AF_INET6) 5567 return -EINVAL; 5568 5569 switch (optname) { 5570 case IPV6_TCLASS: 5571 case IPV6_AUTOFLOWLABEL: 5572 if (*optlen != sizeof(int)) 5573 return -EINVAL; 5574 break; 5575 default: 5576 return -EINVAL; 5577 } 5578 5579 if (getopt) 5580 return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, 5581 KERNEL_SOCKPTR(optval), 5582 KERNEL_SOCKPTR(optlen)); 5583 5584 return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, 5585 KERNEL_SOCKPTR(optval), *optlen); 5586 } 5587 5588 static int __bpf_setsockopt(struct sock *sk, int level, int optname, 5589 char *optval, int optlen) 5590 { 5591 if (!sk_fullsock(sk)) 5592 return -EINVAL; 5593 5594 if (level == SOL_SOCKET) 5595 return sol_socket_sockopt(sk, optname, optval, &optlen, false); 5596 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) 5597 return sol_ip_sockopt(sk, optname, optval, &optlen, false); 5598 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) 5599 return sol_ipv6_sockopt(sk, optname, optval, &optlen, 
false); 5600 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) 5601 return sol_tcp_sockopt(sk, optname, optval, &optlen, false); 5602 5603 return -EINVAL; 5604 } 5605 5606 static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) 5607 { 5608 return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; 5609 } 5610 5611 static int _bpf_setsockopt(struct sock *sk, int level, int optname, 5612 char *optval, int optlen) 5613 { 5614 if (sk_fullsock(sk)) 5615 sock_owned_by_me(sk); 5616 return __bpf_setsockopt(sk, level, optname, optval, optlen); 5617 } 5618 5619 static int __bpf_getsockopt(struct sock *sk, int level, int optname, 5620 char *optval, int optlen) 5621 { 5622 int err, saved_optlen = optlen; 5623 5624 if (!sk_fullsock(sk)) { 5625 err = -EINVAL; 5626 goto done; 5627 } 5628 5629 if (level == SOL_SOCKET) 5630 err = sol_socket_sockopt(sk, optname, optval, &optlen, true); 5631 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) 5632 err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); 5633 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) 5634 err = sol_ip_sockopt(sk, optname, optval, &optlen, true); 5635 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) 5636 err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); 5637 else 5638 err = -EINVAL; 5639 5640 done: 5641 if (err) 5642 optlen = 0; 5643 if (optlen < saved_optlen) 5644 memset(optval + optlen, 0, saved_optlen - optlen); 5645 return err; 5646 } 5647 5648 static int _bpf_getsockopt(struct sock *sk, int level, int optname, 5649 char *optval, int optlen) 5650 { 5651 if (sk_fullsock(sk)) 5652 sock_owned_by_me(sk); 5653 return __bpf_getsockopt(sk, level, optname, optval, optlen); 5654 } 5655 5656 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, 5657 int, optname, char *, optval, int, optlen) 5658 { 5659 return _bpf_setsockopt(sk, level, optname, optval, optlen); 5660 } 5661 5662 const struct bpf_func_proto bpf_sk_setsockopt_proto = { 5663 .func = bpf_sk_setsockopt, 5664 
/*
 * BPF helper: read socket option @optname at @level from @sk into the
 * @optlen byte buffer @optval.
 *
 * Thin wrapper: all arguments are forwarded unchanged to
 * _bpf_getsockopt(), whose return value is passed straight back.
 */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	return _bpf_getsockopt(sk, level, optname, optval, optlen);
}
struct bpf_sock_addr_kern *, ctx, 5725 int, level, int, optname, char *, optval, int, optlen) 5726 { 5727 return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen); 5728 } 5729 5730 static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { 5731 .func = bpf_sock_addr_setsockopt, 5732 .gpl_only = false, 5733 .ret_type = RET_INTEGER, 5734 .arg1_type = ARG_PTR_TO_CTX, 5735 .arg2_type = ARG_ANYTHING, 5736 .arg3_type = ARG_ANYTHING, 5737 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 5738 .arg5_type = ARG_CONST_SIZE, 5739 }; 5740 5741 BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, 5742 int, level, int, optname, char *, optval, int, optlen) 5743 { 5744 return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); 5745 } 5746 5747 static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { 5748 .func = bpf_sock_addr_getsockopt, 5749 .gpl_only = false, 5750 .ret_type = RET_INTEGER, 5751 .arg1_type = ARG_PTR_TO_CTX, 5752 .arg2_type = ARG_ANYTHING, 5753 .arg3_type = ARG_ANYTHING, 5754 .arg4_type = ARG_PTR_TO_UNINIT_MEM, 5755 .arg5_type = ARG_CONST_SIZE, 5756 }; 5757 5758 static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, 5759 char *optval, int optlen, 5760 bool getopt) 5761 { 5762 int val; 5763 5764 if (optlen != sizeof(int)) 5765 return -EINVAL; 5766 5767 if (!sk_has_account(sk)) 5768 return -EOPNOTSUPP; 5769 5770 if (getopt) { 5771 *(int *)optval = sk->sk_bypass_prot_mem; 5772 return 0; 5773 } 5774 5775 val = *(int *)optval; 5776 if (val < 0 || val > 1) 5777 return -EINVAL; 5778 5779 sk->sk_bypass_prot_mem = val; 5780 return 0; 5781 } 5782 5783 BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, 5784 int, optname, char *, optval, int, optlen) 5785 { 5786 if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) 5787 return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); 5788 5789 return __bpf_setsockopt(sk, level, optname, optval, optlen); 5790 } 5791 5792 static const struct 
bpf_func_proto bpf_sock_create_setsockopt_proto = { 5793 .func = bpf_sock_create_setsockopt, 5794 .gpl_only = false, 5795 .ret_type = RET_INTEGER, 5796 .arg1_type = ARG_PTR_TO_CTX, 5797 .arg2_type = ARG_ANYTHING, 5798 .arg3_type = ARG_ANYTHING, 5799 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 5800 .arg5_type = ARG_CONST_SIZE, 5801 }; 5802 5803 BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, 5804 int, optname, char *, optval, int, optlen) 5805 { 5806 if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { 5807 int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); 5808 5809 if (err) 5810 memset(optval, 0, optlen); 5811 5812 return err; 5813 } 5814 5815 return __bpf_getsockopt(sk, level, optname, optval, optlen); 5816 } 5817 5818 static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { 5819 .func = bpf_sock_create_getsockopt, 5820 .gpl_only = false, 5821 .ret_type = RET_INTEGER, 5822 .arg1_type = ARG_PTR_TO_CTX, 5823 .arg2_type = ARG_ANYTHING, 5824 .arg3_type = ARG_ANYTHING, 5825 .arg4_type = ARG_PTR_TO_UNINIT_MEM, 5826 .arg5_type = ARG_CONST_SIZE, 5827 }; 5828 5829 BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, 5830 int, level, int, optname, char *, optval, int, optlen) 5831 { 5832 if (!is_locked_tcp_sock_ops(bpf_sock)) 5833 return -EOPNOTSUPP; 5834 5835 return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); 5836 } 5837 5838 static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { 5839 .func = bpf_sock_ops_setsockopt, 5840 .gpl_only = false, 5841 .ret_type = RET_INTEGER, 5842 .arg1_type = ARG_PTR_TO_CTX, 5843 .arg2_type = ARG_ANYTHING, 5844 .arg3_type = ARG_ANYTHING, 5845 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, 5846 .arg5_type = ARG_CONST_SIZE, 5847 }; 5848 5849 static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, 5850 int optname, const u8 **start) 5851 { 5852 struct sk_buff *syn_skb = bpf_sock->syn_skb; 5853 const u8 *hdr_start; 5854 int 
ret; 5855 5856 if (syn_skb) { 5857 /* sk is a request_sock here */ 5858 5859 if (optname == TCP_BPF_SYN) { 5860 hdr_start = syn_skb->data; 5861 ret = tcp_hdrlen(syn_skb); 5862 } else if (optname == TCP_BPF_SYN_IP) { 5863 hdr_start = skb_network_header(syn_skb); 5864 ret = skb_network_header_len(syn_skb) + 5865 tcp_hdrlen(syn_skb); 5866 } else { 5867 /* optname == TCP_BPF_SYN_MAC */ 5868 hdr_start = skb_mac_header(syn_skb); 5869 ret = skb_mac_header_len(syn_skb) + 5870 skb_network_header_len(syn_skb) + 5871 tcp_hdrlen(syn_skb); 5872 } 5873 } else { 5874 struct sock *sk = bpf_sock->sk; 5875 struct saved_syn *saved_syn; 5876 5877 if (sk->sk_state == TCP_NEW_SYN_RECV) 5878 /* synack retransmit. bpf_sock->syn_skb will 5879 * not be available. It has to resort to 5880 * saved_syn (if it is saved). 5881 */ 5882 saved_syn = inet_reqsk(sk)->saved_syn; 5883 else 5884 saved_syn = tcp_sk(sk)->saved_syn; 5885 5886 if (!saved_syn) 5887 return -ENOENT; 5888 5889 if (optname == TCP_BPF_SYN) { 5890 hdr_start = saved_syn->data + 5891 saved_syn->mac_hdrlen + 5892 saved_syn->network_hdrlen; 5893 ret = saved_syn->tcp_hdrlen; 5894 } else if (optname == TCP_BPF_SYN_IP) { 5895 hdr_start = saved_syn->data + 5896 saved_syn->mac_hdrlen; 5897 ret = saved_syn->network_hdrlen + 5898 saved_syn->tcp_hdrlen; 5899 } else { 5900 /* optname == TCP_BPF_SYN_MAC */ 5901 5902 /* TCP_SAVE_SYN may not have saved the mac hdr */ 5903 if (!saved_syn->mac_hdrlen) 5904 return -ENOENT; 5905 5906 hdr_start = saved_syn->data; 5907 ret = saved_syn->mac_hdrlen + 5908 saved_syn->network_hdrlen + 5909 saved_syn->tcp_hdrlen; 5910 } 5911 } 5912 5913 *start = hdr_start; 5914 return ret; 5915 } 5916 5917 BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, 5918 int, level, int, optname, char *, optval, int, optlen) 5919 { 5920 if (!is_locked_tcp_sock_ops(bpf_sock)) 5921 return -EOPNOTSUPP; 5922 5923 if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && 5924 optname >= TCP_BPF_SYN && optname <= 
TCP_BPF_SYN_MAC) { 5925 int ret, copy_len = 0; 5926 const u8 *start; 5927 5928 ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); 5929 if (ret > 0) { 5930 copy_len = ret; 5931 if (optlen < copy_len) { 5932 copy_len = optlen; 5933 ret = -ENOSPC; 5934 } 5935 5936 memcpy(optval, start, copy_len); 5937 } 5938 5939 /* Zero out unused buffer at the end */ 5940 memset(optval + copy_len, 0, optlen - copy_len); 5941 5942 return ret; 5943 } 5944 5945 return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); 5946 } 5947 5948 static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { 5949 .func = bpf_sock_ops_getsockopt, 5950 .gpl_only = false, 5951 .ret_type = RET_INTEGER, 5952 .arg1_type = ARG_PTR_TO_CTX, 5953 .arg2_type = ARG_ANYTHING, 5954 .arg3_type = ARG_ANYTHING, 5955 .arg4_type = ARG_PTR_TO_UNINIT_MEM, 5956 .arg5_type = ARG_CONST_SIZE, 5957 }; 5958 5959 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, 5960 int, argval) 5961 { 5962 struct sock *sk = bpf_sock->sk; 5963 int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; 5964 5965 if (!is_locked_tcp_sock_ops(bpf_sock)) 5966 return -EOPNOTSUPP; 5967 5968 if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) 5969 return -EINVAL; 5970 5971 tcp_sk(sk)->bpf_sock_ops_cb_flags = val; 5972 5973 return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); 5974 } 5975 5976 static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { 5977 .func = bpf_sock_ops_cb_flags_set, 5978 .gpl_only = false, 5979 .ret_type = RET_INTEGER, 5980 .arg1_type = ARG_PTR_TO_CTX, 5981 .arg2_type = ARG_ANYTHING, 5982 }; 5983 5984 const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; 5985 EXPORT_SYMBOL_GPL(ipv6_bpf_stub); 5986 5987 BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, 5988 int, addr_len) 5989 { 5990 #ifdef CONFIG_INET 5991 struct sock *sk = ctx->sk; 5992 u32 flags = BIND_FROM_BPF; 5993 int err; 5994 5995 err = -EINVAL; 5996 if (addr_len < offsetofend(struct sockaddr, 
sa_family)) 5997 return err; 5998 if (addr->sa_family == AF_INET) { 5999 if (addr_len < sizeof(struct sockaddr_in)) 6000 return err; 6001 if (((struct sockaddr_in *)addr)->sin_port == htons(0)) 6002 flags |= BIND_FORCE_ADDRESS_NO_PORT; 6003 return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); 6004 #if IS_ENABLED(CONFIG_IPV6) 6005 } else if (addr->sa_family == AF_INET6) { 6006 if (addr_len < SIN6_LEN_RFC2133) 6007 return err; 6008 if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) 6009 flags |= BIND_FORCE_ADDRESS_NO_PORT; 6010 /* ipv6_bpf_stub cannot be NULL, since it's called from 6011 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded 6012 */ 6013 return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, 6014 addr_len, flags); 6015 #endif /* CONFIG_IPV6 */ 6016 } 6017 #endif /* CONFIG_INET */ 6018 6019 return -EAFNOSUPPORT; 6020 } 6021 6022 static const struct bpf_func_proto bpf_bind_proto = { 6023 .func = bpf_bind, 6024 .gpl_only = false, 6025 .ret_type = RET_INTEGER, 6026 .arg1_type = ARG_PTR_TO_CTX, 6027 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6028 .arg3_type = ARG_CONST_SIZE, 6029 }; 6030 6031 #ifdef CONFIG_XFRM 6032 6033 #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ 6034 (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) 6035 6036 struct metadata_dst __percpu *xfrm_bpf_md_dst; 6037 EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); 6038 6039 #endif 6040 6041 BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, 6042 struct bpf_xfrm_state *, to, u32, size, u64, flags) 6043 { 6044 const struct sec_path *sp = skb_sec_path(skb); 6045 const struct xfrm_state *x; 6046 6047 if (!sp || unlikely(index >= sp->len || flags)) 6048 goto err_clear; 6049 6050 x = sp->xvec[index]; 6051 6052 if (unlikely(size != sizeof(struct bpf_xfrm_state))) 6053 goto err_clear; 6054 6055 to->reqid = x->props.reqid; 6056 to->spi = x->id.spi; 6057 to->family = x->props.family; 
6058 to->ext = 0; 6059 6060 if (to->family == AF_INET6) { 6061 memcpy(to->remote_ipv6, x->props.saddr.a6, 6062 sizeof(to->remote_ipv6)); 6063 } else { 6064 to->remote_ipv4 = x->props.saddr.a4; 6065 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); 6066 } 6067 6068 return 0; 6069 err_clear: 6070 memset(to, 0, size); 6071 return -EINVAL; 6072 } 6073 6074 static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { 6075 .func = bpf_skb_get_xfrm_state, 6076 .gpl_only = false, 6077 .ret_type = RET_INTEGER, 6078 .arg1_type = ARG_PTR_TO_CTX, 6079 .arg2_type = ARG_ANYTHING, 6080 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 6081 .arg4_type = ARG_CONST_SIZE, 6082 .arg5_type = ARG_ANYTHING, 6083 }; 6084 #endif 6085 6086 #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) 6087 static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu) 6088 { 6089 params->h_vlan_TCI = 0; 6090 params->h_vlan_proto = 0; 6091 if (mtu) 6092 params->mtu_result = mtu; /* union with tot_len */ 6093 6094 return 0; 6095 } 6096 #endif 6097 6098 #if IS_ENABLED(CONFIG_INET) 6099 static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, 6100 u32 flags, bool check_mtu) 6101 { 6102 struct fib_nh_common *nhc; 6103 struct in_device *in_dev; 6104 struct neighbour *neigh; 6105 struct net_device *dev; 6106 struct fib_result res; 6107 struct flowi4 fl4; 6108 u32 mtu = 0; 6109 int err; 6110 6111 dev = dev_get_by_index_rcu(net, params->ifindex); 6112 if (unlikely(!dev)) 6113 return -ENODEV; 6114 6115 /* verify forwarding is enabled on this interface */ 6116 in_dev = __in_dev_get_rcu(dev); 6117 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) 6118 return BPF_FIB_LKUP_RET_FWD_DISABLED; 6119 6120 if (flags & BPF_FIB_LOOKUP_OUTPUT) { 6121 fl4.flowi4_iif = 1; 6122 fl4.flowi4_oif = params->ifindex; 6123 } else { 6124 fl4.flowi4_iif = params->ifindex; 6125 fl4.flowi4_oif = 0; 6126 } 6127 fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); 6128 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 
6129 fl4.flowi4_flags = 0; 6130 6131 fl4.flowi4_proto = params->l4_protocol; 6132 fl4.daddr = params->ipv4_dst; 6133 fl4.saddr = params->ipv4_src; 6134 fl4.fl4_sport = params->sport; 6135 fl4.fl4_dport = params->dport; 6136 fl4.flowi4_multipath_hash = 0; 6137 6138 if (flags & BPF_FIB_LOOKUP_DIRECT) { 6139 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; 6140 struct fib_table *tb; 6141 6142 if (flags & BPF_FIB_LOOKUP_TBID) { 6143 tbid = params->tbid; 6144 /* zero out for vlan output */ 6145 params->tbid = 0; 6146 } 6147 6148 tb = fib_get_table(net, tbid); 6149 if (unlikely(!tb)) 6150 return BPF_FIB_LKUP_RET_NOT_FWDED; 6151 6152 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); 6153 } else { 6154 if (flags & BPF_FIB_LOOKUP_MARK) 6155 fl4.flowi4_mark = params->mark; 6156 else 6157 fl4.flowi4_mark = 0; 6158 fl4.flowi4_secid = 0; 6159 fl4.flowi4_tun_key.tun_id = 0; 6160 fl4.flowi4_uid = sock_net_uid(net, NULL); 6161 6162 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); 6163 } 6164 6165 if (err) { 6166 /* map fib lookup errors to RTN_ type */ 6167 if (err == -EINVAL) 6168 return BPF_FIB_LKUP_RET_BLACKHOLE; 6169 if (err == -EHOSTUNREACH) 6170 return BPF_FIB_LKUP_RET_UNREACHABLE; 6171 if (err == -EACCES) 6172 return BPF_FIB_LKUP_RET_PROHIBIT; 6173 6174 return BPF_FIB_LKUP_RET_NOT_FWDED; 6175 } 6176 6177 if (res.type != RTN_UNICAST) 6178 return BPF_FIB_LKUP_RET_NOT_FWDED; 6179 6180 if (fib_info_num_path(res.fi) > 1) 6181 fib_select_path(net, &res, &fl4, NULL); 6182 6183 if (check_mtu) { 6184 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); 6185 if (params->tot_len > mtu) { 6186 params->mtu_result = mtu; /* union with tot_len */ 6187 return BPF_FIB_LKUP_RET_FRAG_NEEDED; 6188 } 6189 } 6190 6191 nhc = res.nhc; 6192 6193 /* do not handle lwt encaps right now */ 6194 if (nhc->nhc_lwtstate) 6195 return BPF_FIB_LKUP_RET_UNSUPP_LWT; 6196 6197 dev = nhc->nhc_dev; 6198 6199 params->rt_metric = res.fi->fib_priority; 6200 params->ifindex = dev->ifindex; 6201 
/* IPv6 backend of the bpf_fib_lookup helpers.
 *
 * @net:       network namespace to perform the lookup in
 * @params:    lookup key on input (ifindex, addresses, ports, protocol,
 *             flowinfo, optionally tbid/mark/tot_len); rewritten with the
 *             forwarding result on success
 * @flags:     BPF_FIB_LOOKUP_* flags selecting the lookup variant
 * @check_mtu: when true, compare params->tot_len against the route MTU
 *
 * Returns the value of bpf_fib_set_fwd_params() on success, one of the
 * BPF_FIB_LKUP_RET_* codes when the packet cannot be forwarded, or -ENODEV
 * when params->ifindex does not name a device in @net.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags, bool check_mtu)
{
	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
	struct fib6_result res = {};
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;
	struct flowi6 fl6;
	int strict = 0;
	int oif, err;
	u32 mtu = 0;

	/* link local addresses are never forwarded */
	if (rt6_need_strict(dst) || rt6_need_strict(src))
		return BPF_FIB_LKUP_RET_NOT_FWDED;

	dev = dev_get_by_index_rcu(net, params->ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	/* forwarding must be enabled on the device named by params->ifindex */
	idev = __in6_dev_get_safely(dev);
	if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
		return BPF_FIB_LKUP_RET_FWD_DISABLED;

	/* params->ifindex is the egress device for OUTPUT lookups and the
	 * ingress device otherwise.
	 */
	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
		fl6.flowi6_iif = 1;
		oif = fl6.flowi6_oif = params->ifindex;
	} else {
		oif = fl6.flowi6_iif = params->ifindex;
		fl6.flowi6_oif = 0;
		strict = RT6_LOOKUP_F_HAS_SADDR;
	}
	fl6.flowlabel = params->flowinfo;
	fl6.flowi6_scope = 0;
	fl6.flowi6_flags = 0;
	fl6.mp_hash = 0;

	fl6.flowi6_proto = params->l4_protocol;
	fl6.daddr = *dst;
	fl6.saddr = *src;
	fl6.fl6_sport = params->sport;
	fl6.fl6_dport = params->dport;

	if (flags & BPF_FIB_LOOKUP_DIRECT) {
		/* Direct lookup in a single table: the l3mdev table of the
		 * device if there is one, else the main table, unless the
		 * caller supplied an explicit table id.
		 */
		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
		struct fib6_table *tb;

		if (flags & BPF_FIB_LOOKUP_TBID) {
			tbid = params->tbid;
			/* zero out for vlan output */
			params->tbid = 0;
		}

		tb = ipv6_stub->fib6_get_table(net, tbid);
		if (unlikely(!tb))
			return BPF_FIB_LKUP_RET_NOT_FWDED;

		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
						   strict);
	} else {
		/* Lookup via fib6_lookup(); the remaining flow fields it
		 * is given are filled in here.
		 */
		if (flags & BPF_FIB_LOOKUP_MARK)
			fl6.flowi6_mark = params->mark;
		else
			fl6.flowi6_mark = 0;
		fl6.flowi6_secid = 0;
		fl6.flowi6_tun_key.tun_id = 0;
		fl6.flowi6_uid = sock_net_uid(net, NULL);

		err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
	}

	if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
		     res.f6i == net->ipv6.fib6_null_entry))
		return BPF_FIB_LKUP_RET_NOT_FWDED;

	switch (res.fib6_type) {
	/* only unicast is forwarded */
	case RTN_UNICAST:
		break;
	case RTN_BLACKHOLE:
		return BPF_FIB_LKUP_RET_BLACKHOLE;
	case RTN_UNREACHABLE:
		return BPF_FIB_LKUP_RET_UNREACHABLE;
	case RTN_PROHIBIT:
		return BPF_FIB_LKUP_RET_PROHIBIT;
	default:
		return BPF_FIB_LKUP_RET_NOT_FWDED;
	}

	ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
				    fl6.flowi6_oif != 0, NULL, strict);

	if (check_mtu) {
		mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
		if (params->tot_len > mtu) {
			params->mtu_result = mtu; /* union with tot_len */
			return BPF_FIB_LKUP_RET_FRAG_NEEDED;
		}
	}

	/* lightweight-tunnel state on the nexthop is not supported */
	if (res.nh->fib_nh_lws)
		return BPF_FIB_LKUP_RET_UNSUPP_LWT;

	/* report the gateway as the destination when the nexthop has one */
	if (res.nh->fib_nh_gw_family)
		*dst = res.nh->fib_nh_gw6;

	dev = res.nh->fib_nh_dev;
	params->rt_metric = res.f6i->fib6_metric;
	params->ifindex = dev->ifindex;

	if (flags & BPF_FIB_LOOKUP_SRC) {
		/* prefer the route's preferred source, else ask the device */
		if (res.f6i->fib6_prefsrc.plen) {
			*src = res.f6i->fib6_prefsrc.addr;
		} else {
			err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
								&fl6.daddr, 0,
								src);
			if (err)
				return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
		}
	}

	if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
		goto set_fwd_params;

	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
	 * not needed here.
	 */
	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
		return BPF_FIB_LKUP_RET_NO_NEIGH;
	memcpy(params->dmac, neigh->ha, ETH_ALEN);
	memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
	return bpf_fib_set_fwd_params(params, mtu);
}
true); 6402 #endif 6403 } 6404 return -EAFNOSUPPORT; 6405 } 6406 6407 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { 6408 .func = bpf_xdp_fib_lookup, 6409 .gpl_only = true, 6410 .ret_type = RET_INTEGER, 6411 .arg1_type = ARG_PTR_TO_CTX, 6412 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, 6413 .arg3_type = ARG_CONST_SIZE, 6414 .arg4_type = ARG_ANYTHING, 6415 }; 6416 6417 BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, 6418 struct bpf_fib_lookup *, params, int, plen, u32, flags) 6419 { 6420 struct net *net = dev_net(skb->dev); 6421 int rc = -EAFNOSUPPORT; 6422 bool check_mtu = false; 6423 6424 if (plen < sizeof(*params)) 6425 return -EINVAL; 6426 6427 if (flags & ~BPF_FIB_LOOKUP_MASK) 6428 return -EINVAL; 6429 6430 if (params->tot_len) 6431 check_mtu = true; 6432 6433 switch (params->family) { 6434 #if IS_ENABLED(CONFIG_INET) 6435 case AF_INET: 6436 rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); 6437 break; 6438 #endif 6439 #if IS_ENABLED(CONFIG_IPV6) 6440 case AF_INET6: 6441 rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); 6442 break; 6443 #endif 6444 } 6445 6446 if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { 6447 struct net_device *dev; 6448 6449 /* When tot_len isn't provided by user, check skb 6450 * against MTU of FIB lookup resulting net_device 6451 */ 6452 dev = dev_get_by_index_rcu(net, params->ifindex); 6453 if (!is_skb_forwardable(dev, skb)) 6454 rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; 6455 6456 params->mtu_result = dev->mtu; /* union with tot_len */ 6457 } 6458 6459 return rc; 6460 } 6461 6462 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { 6463 .func = bpf_skb_fib_lookup, 6464 .gpl_only = true, 6465 .ret_type = RET_INTEGER, 6466 .arg1_type = ARG_PTR_TO_CTX, 6467 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, 6468 .arg3_type = ARG_CONST_SIZE, 6469 .arg4_type = ARG_ANYTHING, 6470 }; 6471 6472 static struct net_device *__dev_via_ifindex(struct net_device *dev_curr, 6473 u32 ifindex) 6474 { 6475 struct net *netns = 
dev_net(dev_curr); 6476 6477 /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */ 6478 if (ifindex == 0) 6479 return dev_curr; 6480 6481 return dev_get_by_index_rcu(netns, ifindex); 6482 } 6483 6484 BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, 6485 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) 6486 { 6487 int ret = BPF_MTU_CHK_RET_FRAG_NEEDED; 6488 struct net_device *dev = skb->dev; 6489 int mtu, dev_len, skb_len; 6490 6491 if (unlikely(flags & ~(BPF_MTU_CHK_SEGS))) 6492 return -EINVAL; 6493 if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len))) 6494 return -EINVAL; 6495 6496 dev = __dev_via_ifindex(dev, ifindex); 6497 if (unlikely(!dev)) 6498 return -ENODEV; 6499 6500 mtu = READ_ONCE(dev->mtu); 6501 dev_len = mtu + dev->hard_header_len; 6502 6503 /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ 6504 skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len; 6505 6506 skb_len += len_diff; /* minus result pass check */ 6507 if (skb_len <= dev_len) { 6508 ret = BPF_MTU_CHK_RET_SUCCESS; 6509 goto out; 6510 } 6511 /* At this point, skb->len exceed MTU, but as it include length of all 6512 * segments, it can still be below MTU. The SKB can possibly get 6513 * re-segmented in transmit path (see validate_xmit_skb). Thus, user 6514 * must choose if segs are to be MTU checked. 
6515 */ 6516 if (skb_is_gso(skb)) { 6517 ret = BPF_MTU_CHK_RET_SUCCESS; 6518 if (flags & BPF_MTU_CHK_SEGS) { 6519 if (!skb_transport_header_was_set(skb)) 6520 return -EINVAL; 6521 if (!skb_gso_validate_network_len(skb, mtu)) 6522 ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; 6523 } 6524 } 6525 out: 6526 *mtu_len = mtu; 6527 return ret; 6528 } 6529 6530 BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp, 6531 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags) 6532 { 6533 struct net_device *dev = xdp->rxq->dev; 6534 int xdp_len = xdp->data_end - xdp->data; 6535 int ret = BPF_MTU_CHK_RET_SUCCESS; 6536 int mtu, dev_len; 6537 6538 /* XDP variant doesn't support multi-buffer segment check (yet) */ 6539 if (unlikely(flags)) 6540 return -EINVAL; 6541 6542 dev = __dev_via_ifindex(dev, ifindex); 6543 if (unlikely(!dev)) 6544 return -ENODEV; 6545 6546 mtu = READ_ONCE(dev->mtu); 6547 dev_len = mtu + dev->hard_header_len; 6548 6549 /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */ 6550 if (*mtu_len) 6551 xdp_len = *mtu_len + dev->hard_header_len; 6552 6553 xdp_len += len_diff; /* minus result pass check */ 6554 if (xdp_len > dev_len) 6555 ret = BPF_MTU_CHK_RET_FRAG_NEEDED; 6556 6557 *mtu_len = mtu; 6558 return ret; 6559 } 6560 6561 static const struct bpf_func_proto bpf_skb_check_mtu_proto = { 6562 .func = bpf_skb_check_mtu, 6563 .gpl_only = true, 6564 .ret_type = RET_INTEGER, 6565 .arg1_type = ARG_PTR_TO_CTX, 6566 .arg2_type = ARG_ANYTHING, 6567 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, 6568 .arg3_size = sizeof(u32), 6569 .arg4_type = ARG_ANYTHING, 6570 .arg5_type = ARG_ANYTHING, 6571 }; 6572 6573 static const struct bpf_func_proto bpf_xdp_check_mtu_proto = { 6574 .func = bpf_xdp_check_mtu, 6575 .gpl_only = true, 6576 .ret_type = RET_INTEGER, 6577 .arg1_type = ARG_PTR_TO_CTX, 6578 .arg2_type = ARG_ANYTHING, 6579 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED, 6580 .arg3_size = sizeof(u32), 6581 .arg4_type = ARG_ANYTHING, 
6582 .arg5_type = ARG_ANYTHING, 6583 }; 6584 6585 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 6586 static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) 6587 { 6588 int err; 6589 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; 6590 6591 if (!seg6_validate_srh(srh, len, false)) 6592 return -EINVAL; 6593 6594 switch (type) { 6595 case BPF_LWT_ENCAP_SEG6_INLINE: 6596 if (skb->protocol != htons(ETH_P_IPV6)) 6597 return -EBADMSG; 6598 6599 err = seg6_do_srh_inline(skb, srh); 6600 break; 6601 case BPF_LWT_ENCAP_SEG6: 6602 skb_reset_inner_headers(skb); 6603 skb->encapsulation = 1; 6604 err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); 6605 break; 6606 default: 6607 return -EINVAL; 6608 } 6609 6610 bpf_compute_data_pointers(skb); 6611 if (err) 6612 return err; 6613 6614 skb_set_transport_header(skb, sizeof(struct ipv6hdr)); 6615 6616 return seg6_lookup_nexthop(skb, NULL, 0); 6617 } 6618 #endif /* CONFIG_IPV6_SEG6_BPF */ 6619 6620 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) 6621 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, 6622 bool ingress) 6623 { 6624 return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); 6625 } 6626 #endif 6627 6628 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, 6629 u32, len) 6630 { 6631 switch (type) { 6632 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 6633 case BPF_LWT_ENCAP_SEG6: 6634 case BPF_LWT_ENCAP_SEG6_INLINE: 6635 return bpf_push_seg6_encap(skb, type, hdr, len); 6636 #endif 6637 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) 6638 case BPF_LWT_ENCAP_IP: 6639 return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); 6640 #endif 6641 default: 6642 return -EINVAL; 6643 } 6644 } 6645 6646 BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, 6647 void *, hdr, u32, len) 6648 { 6649 switch (type) { 6650 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF) 6651 case BPF_LWT_ENCAP_IP: 6652 return bpf_push_ip_encap(skb, hdr, len, false /* egress */); 6653 #endif 6654 default: 6655 return -EINVAL; 
6656 } 6657 } 6658 6659 static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { 6660 .func = bpf_lwt_in_push_encap, 6661 .gpl_only = false, 6662 .ret_type = RET_INTEGER, 6663 .arg1_type = ARG_PTR_TO_CTX, 6664 .arg2_type = ARG_ANYTHING, 6665 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6666 .arg4_type = ARG_CONST_SIZE 6667 }; 6668 6669 static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { 6670 .func = bpf_lwt_xmit_push_encap, 6671 .gpl_only = false, 6672 .ret_type = RET_INTEGER, 6673 .arg1_type = ARG_PTR_TO_CTX, 6674 .arg2_type = ARG_ANYTHING, 6675 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6676 .arg4_type = ARG_CONST_SIZE 6677 }; 6678 6679 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 6680 BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, 6681 const void *, from, u32, len) 6682 { 6683 struct seg6_bpf_srh_state *srh_state = 6684 this_cpu_ptr(&seg6_bpf_srh_states); 6685 struct ipv6_sr_hdr *srh = srh_state->srh; 6686 void *srh_tlvs, *srh_end, *ptr; 6687 int srhoff = 0; 6688 6689 lockdep_assert_held(&srh_state->bh_lock); 6690 if (srh == NULL) 6691 return -EINVAL; 6692 6693 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); 6694 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); 6695 6696 ptr = skb->data + offset; 6697 if (ptr >= srh_tlvs && ptr + len <= srh_end) 6698 srh_state->valid = false; 6699 else if (ptr < (void *)&srh->flags || 6700 ptr + len > (void *)&srh->segments) 6701 return -EFAULT; 6702 6703 if (unlikely(bpf_try_make_writable(skb, offset + len))) 6704 return -EFAULT; 6705 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) 6706 return -EINVAL; 6707 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); 6708 6709 memcpy(skb->data + offset, from, len); 6710 return 0; 6711 } 6712 6713 static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { 6714 .func = bpf_lwt_seg6_store_bytes, 6715 .gpl_only = false, 6716 .ret_type = RET_INTEGER, 6717 .arg1_type = 
ARG_PTR_TO_CTX, 6718 .arg2_type = ARG_ANYTHING, 6719 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6720 .arg4_type = ARG_CONST_SIZE 6721 }; 6722 6723 static void bpf_update_srh_state(struct sk_buff *skb) 6724 { 6725 struct seg6_bpf_srh_state *srh_state = 6726 this_cpu_ptr(&seg6_bpf_srh_states); 6727 int srhoff = 0; 6728 6729 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { 6730 srh_state->srh = NULL; 6731 } else { 6732 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); 6733 srh_state->hdrlen = srh_state->srh->hdrlen << 3; 6734 srh_state->valid = true; 6735 } 6736 } 6737 6738 BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, 6739 u32, action, void *, param, u32, param_len) 6740 { 6741 struct seg6_bpf_srh_state *srh_state = 6742 this_cpu_ptr(&seg6_bpf_srh_states); 6743 int hdroff = 0; 6744 int err; 6745 6746 lockdep_assert_held(&srh_state->bh_lock); 6747 switch (action) { 6748 case SEG6_LOCAL_ACTION_END_X: 6749 if (!seg6_bpf_has_valid_srh(skb)) 6750 return -EBADMSG; 6751 if (param_len != sizeof(struct in6_addr)) 6752 return -EINVAL; 6753 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); 6754 case SEG6_LOCAL_ACTION_END_T: 6755 if (!seg6_bpf_has_valid_srh(skb)) 6756 return -EBADMSG; 6757 if (param_len != sizeof(int)) 6758 return -EINVAL; 6759 return seg6_lookup_nexthop(skb, NULL, *(int *)param); 6760 case SEG6_LOCAL_ACTION_END_DT6: 6761 if (!seg6_bpf_has_valid_srh(skb)) 6762 return -EBADMSG; 6763 if (param_len != sizeof(int)) 6764 return -EINVAL; 6765 6766 if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) 6767 return -EBADMSG; 6768 if (!pskb_pull(skb, hdroff)) 6769 return -EBADMSG; 6770 6771 skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); 6772 skb_reset_network_header(skb); 6773 skb_reset_transport_header(skb); 6774 skb->encapsulation = 0; 6775 6776 bpf_compute_data_pointers(skb); 6777 bpf_update_srh_state(skb); 6778 return seg6_lookup_nexthop(skb, NULL, *(int *)param); 6779 case SEG6_LOCAL_ACTION_END_B6: 
6780 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) 6781 return -EBADMSG; 6782 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, 6783 param, param_len); 6784 if (!err) 6785 bpf_update_srh_state(skb); 6786 6787 return err; 6788 case SEG6_LOCAL_ACTION_END_B6_ENCAP: 6789 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) 6790 return -EBADMSG; 6791 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, 6792 param, param_len); 6793 if (!err) 6794 bpf_update_srh_state(skb); 6795 6796 return err; 6797 default: 6798 return -EINVAL; 6799 } 6800 } 6801 6802 static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { 6803 .func = bpf_lwt_seg6_action, 6804 .gpl_only = false, 6805 .ret_type = RET_INTEGER, 6806 .arg1_type = ARG_PTR_TO_CTX, 6807 .arg2_type = ARG_ANYTHING, 6808 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, 6809 .arg4_type = ARG_CONST_SIZE 6810 }; 6811 6812 BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, 6813 s32, len) 6814 { 6815 struct seg6_bpf_srh_state *srh_state = 6816 this_cpu_ptr(&seg6_bpf_srh_states); 6817 struct ipv6_sr_hdr *srh = srh_state->srh; 6818 void *srh_end, *srh_tlvs, *ptr; 6819 struct ipv6hdr *hdr; 6820 int srhoff = 0; 6821 int ret; 6822 6823 lockdep_assert_held(&srh_state->bh_lock); 6824 if (unlikely(srh == NULL)) 6825 return -EINVAL; 6826 6827 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + 6828 ((srh->first_segment + 1) << 4)); 6829 srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + 6830 srh_state->hdrlen); 6831 ptr = skb->data + offset; 6832 6833 if (unlikely(ptr < srh_tlvs || ptr > srh_end)) 6834 return -EFAULT; 6835 if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) 6836 return -EFAULT; 6837 6838 if (len > 0) { 6839 ret = skb_cow_head(skb, len); 6840 if (unlikely(ret < 0)) 6841 return ret; 6842 6843 ret = bpf_skb_net_hdr_push(skb, offset, len); 6844 } else { 6845 ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); 6846 } 6847 6848 bpf_compute_data_pointers(skb); 6849 if 
(unlikely(ret < 0))
6850 		return ret;
6851
	/* The packet length changed: rewrite the IPv6 payload length from
	 * skb->len, find the routing header again, add len to the cached
	 * hdrlen and mark the cached state as no longer valid.
	 */
6852 	hdr = (struct ipv6hdr *)skb->data;
6853 	hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
6854
6855 	if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
6856 		return -EINVAL;
6857 	srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6858 	srh_state->hdrlen += len;
6859 	srh_state->valid = false;
6860 	return 0;
6861 }
6862
/* Helper prototype for bpf_lwt_seg6_adjust_srh(): ctx pointer and two scalars. */
6863 static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
6864 	.func = bpf_lwt_seg6_adjust_srh,
6865 	.gpl_only = false,
6866 	.ret_type = RET_INTEGER,
6867 	.arg1_type = ARG_PTR_TO_CTX,
6868 	.arg2_type = ARG_ANYTHING,
6869 	.arg3_type = ARG_ANYTHING,
6870 };
6871 #endif /* CONFIG_IPV6_SEG6_BPF */
6872
6873 #ifdef CONFIG_INET
/* Look up a socket in @net for the addresses and ports stored in @tuple.
 *
 * @family == AF_INET reads tuple->ipv4 and uses __inet_lookup() when @proto
 * is IPPROTO_TCP, __udp4_lib_lookup() for any other @proto. Every other
 * @family value takes the IPv6 branch, which only exists when CONFIG_IPV6 is
 * enabled: __inet6_lookup() for TCP, or ipv6_bpf_stub->udp6_lib_lookup() when
 * the stub pointer is set. The IPv6 TCP lookup passes the destination port
 * through ntohs(); the other three lookups pass the tuple's dport unchanged.
 *
 * Returns the socket found, or NULL. A socket that is neither reported as
 * refcounted by the lookup nor flagged SOCK_RCU_FREE triggers a one-time
 * warning and is dropped.
 */
6874 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
6875 			      int dif, int sdif, u8 family, u8 proto)
6876 {
6877 	bool refcounted = false;
6878 	struct sock *sk = NULL;
6879
6880 	if (family == AF_INET) {
6881 		__be32 src4 = tuple->ipv4.saddr;
6882 		__be32 dst4 = tuple->ipv4.daddr;
6883
6884 		if (proto == IPPROTO_TCP)
6885 			sk = __inet_lookup(net, NULL, 0,
6886 					   src4, tuple->ipv4.sport,
6887 					   dst4, tuple->ipv4.dport,
6888 					   dif, sdif, &refcounted);
6889 		else
6890 			sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
6891 					       dst4, tuple->ipv4.dport,
6892 					       dif, sdif, net->ipv4.udp_table, NULL);
6893 #if IS_ENABLED(CONFIG_IPV6)
6894 	} else {
6895 		struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
6896 		struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
6897
6898 		if (proto == IPPROTO_TCP)
6899 			sk = __inet6_lookup(net, NULL, 0,
6900 					    src6, tuple->ipv6.sport,
6901 					    dst6, ntohs(tuple->ipv6.dport),
6902 					    dif, sdif, &refcounted);
6903 		else if (likely(ipv6_bpf_stub))
6904 			sk = ipv6_bpf_stub->udp6_lib_lookup(net,
6905 							    src6, tuple->ipv6.sport,
6906 							    dst6, tuple->ipv6.dport,
6907 							    dif, sdif,
6908 							    net->ipv4.udp_table, NULL);
6909 #endif
6910 	}
6911
6912 	if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
6913 
WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6914 		sk = NULL;
6915 	}
6916 	return sk;
6917 }
6918
6919 /* bpf_skc_lookup performs the core lookup for different types of sockets,
6920  * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
6921  */
/* Parameters as used below:
 *   @len       selects the address family: sizeof(tuple->ipv4) means AF_INET,
 *              sizeof(tuple->ipv6) means AF_INET6, anything else returns NULL.
 *   @flags     must be 0, otherwise NULL is returned.
 *   @netns_id  negative when viewed as s32 means "use @caller_net"; otherwise
 *              it must be <= S32_MAX and is resolved with get_net_ns_by_id().
 *   @sdif      a negative value is replaced by inet_sdif(skb) or
 *              inet6_sdif(skb) depending on the family.
 * Returns the socket from sk_lookup(), or NULL.
 */
6922 static struct sock *
6923 __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6924 		 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6925 		 u64 flags, int sdif)
6926 {
6927 	struct sock *sk = NULL;
6928 	struct net *net;
6929 	u8 family;
6930
6931 	if (len == sizeof(tuple->ipv4))
6932 		family = AF_INET;
6933 	else if (len == sizeof(tuple->ipv6))
6934 		family = AF_INET6;
6935 	else
6936 		return NULL;
6937
6938 	if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
6939 		goto out;
6940
6941 	if (sdif < 0) {
6942 		if (family == AF_INET)
6943 			sdif = inet_sdif(skb);
6944 		else
6945 			sdif = inet6_sdif(skb);
6946 	}
6947
6948 	if ((s32)netns_id < 0) {
6949 		net = caller_net;
6950 		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6951 	} else {
		/* the namespace reference is dropped right after the lookup */
6952 		net = get_net_ns_by_id(caller_net, netns_id);
6953 		if (unlikely(!net))
6954 			goto out;
6955 		sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6956 		put_net(net);
6957 	}
6958
6959 out:
6960 	return sk;
6961 }
6962
/* Same lookup as __bpf_skc_lookup(), with the result passed through
 * sk_to_full_sk().
 */
6963 static struct sock *
6964 __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6965 		struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6966 		u64 flags, int sdif)
6967 {
6968 	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
6969 					   ifindex, proto, netns_id, flags,
6970 					   sdif);
6971
6972 	if (sk) {
6973 		struct sock *sk2 = sk_to_full_sk(sk);
6974
6975 		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
6976 		 * sock refcnt is decremented to prevent a request_sock leak.
6977 		 */
6978 		if (sk2 != sk) {
6979 			sock_gen_put(sk);
6980 			/* Ensure there is no need to bump sk2 refcnt */
6981 			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
6982 				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6983 				return NULL;
6984 			}
6985 			sk = sk2;
6986 		}
6987 	}
6988
6989 	return sk;
6990 }
6991
/* skb-based front end for __bpf_skc_lookup().
 *
 * The caller's namespace and ifindex come from skb->dev when it is set;
 * otherwise the namespace comes from sock_net(skb->sk) and ifindex is 0.
 * sdif is passed as -1, so __bpf_skc_lookup() derives it from the skb.
 */
6992 static struct sock *
6993 bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6994 	       u8 proto, u64 netns_id, u64 flags)
6995 {
6996 	struct net *caller_net;
6997 	int ifindex;
6998
6999 	if (skb->dev) {
7000 		caller_net = dev_net(skb->dev);
7001 		ifindex = skb->dev->ifindex;
7002 	} else {
7003 		caller_net = sock_net(skb->sk);
7004 		ifindex = 0;
7005 	}
7006
7007 	return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
7008 				netns_id, flags, -1);
7009 }
7010
/* Same lookup as bpf_skc_lookup(), with the result passed through
 * sk_to_full_sk().
 */
7011 static struct sock *
7012 bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
7013 	      u8 proto, u64 netns_id, u64 flags)
7014 {
7015 	struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
7016 					 flags);
7017
7018 	if (sk) {
7019 		struct sock *sk2 = sk_to_full_sk(sk);
7020
7021 		/* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
7022 		 * sock refcnt is decremented to prevent a request_sock leak.
7023 		 */
7024 		if (sk2 != sk) {
7025 			sock_gen_put(sk);
7026 			/* Ensure there is no need to bump sk2 refcnt */
7027 			if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
7028 				WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
7029 				return NULL;
7030 			}
7031 			sk = sk2;
7032 		}
7033 	}
7034
7035 	return sk;
7036 }
7037
/* BPF helper: bpf_skc_lookup() for IPPROTO_TCP; the socket pointer (or NULL)
 * is returned as an unsigned long.
 */
7038 BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
7039 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7040 {
7041 	return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
7042 					     netns_id, flags);
7043 }
7044
/* Prototype: ctx, read-only tuple memory with its size (zero allowed), then
 * two scalars. Declared to return a sock_common pointer or NULL.
 */
7045 static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
7046 	.func = bpf_skc_lookup_tcp,
7047 	.gpl_only = false,
7048 	.pkt_access = true,
7049 	.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7050 	.arg1_type = ARG_PTR_TO_CTX,
7051 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7052 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7053 	.arg4_type = ARG_ANYTHING,
7054 	.arg5_type = ARG_ANYTHING,
7055 };
7056
/* BPF helper: bpf_sk_lookup() for IPPROTO_TCP. */
7057 BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
7058 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7059 {
7060 	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
7061 					    netns_id, flags);
7062 }
7063
/* Same argument layout as the skc variant; declared to return a socket
 * pointer or NULL.
 */
7064 static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
7065 	.func = bpf_sk_lookup_tcp,
7066 	.gpl_only = false,
7067 	.pkt_access = true,
7068 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7069 	.arg1_type = ARG_PTR_TO_CTX,
7070 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7071 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7072 	.arg4_type = ARG_ANYTHING,
7073 	.arg5_type = ARG_ANYTHING,
7074 };
7075
/* BPF helper: bpf_sk_lookup() for IPPROTO_UDP. */
7076 BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
7077 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7078 {
7079 	return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
7080 					    netns_id, flags);
7081 }
7082
7083 static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
7084 	.func = bpf_sk_lookup_udp,
7085 	.gpl_only = false,
7086 	.pkt_access = true,
7087 
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7088 	.arg1_type = ARG_PTR_TO_CTX,
7089 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7090 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7091 	.arg4_type = ARG_ANYTHING,
7092 	.arg5_type = ARG_ANYTHING,
7093 };
7094
/* BPF helper (tc flavour): __bpf_skc_lookup() for IPPROTO_TCP.
 *
 * ifindex, sdif (via dev_sdif()) and the caller namespace are all taken from
 * skb->dev, which is dereferenced without a NULL check here.
 */
7095 BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
7096 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7097 {
7098 	struct net_device *dev = skb->dev;
7099 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7100 	struct net *caller_net = dev_net(dev);
7101
7102 	return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
7103 					       ifindex, IPPROTO_TCP, netns_id,
7104 					       flags, sdif);
7105 }
7106
/* Prototype: ctx, read-only tuple memory with its size (zero allowed), then
 * two scalars. Declared to return a sock_common pointer or NULL.
 */
7107 static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
7108 	.func = bpf_tc_skc_lookup_tcp,
7109 	.gpl_only = false,
7110 	.pkt_access = true,
7111 	.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7112 	.arg1_type = ARG_PTR_TO_CTX,
7113 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7114 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7115 	.arg4_type = ARG_ANYTHING,
7116 	.arg5_type = ARG_ANYTHING,
7117 };
7118
/* BPF helper (tc flavour): __bpf_sk_lookup() for IPPROTO_TCP, with the device
 * parameters taken from skb->dev as above.
 */
7119 BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
7120 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7121 {
7122 	struct net_device *dev = skb->dev;
7123 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7124 	struct net *caller_net = dev_net(dev);
7125
7126 	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
7127 					      ifindex, IPPROTO_TCP, netns_id,
7128 					      flags, sdif);
7129 }
7130
/* Same argument layout; declared to return a socket pointer or NULL. */
7131 static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
7132 	.func = bpf_tc_sk_lookup_tcp,
7133 	.gpl_only = false,
7134 	.pkt_access = true,
7135 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7136 	.arg1_type = ARG_PTR_TO_CTX,
7137 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7138 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7139 	.arg4_type = ARG_ANYTHING,
7140 	.arg5_type = ARG_ANYTHING,
7141 };
7142
/* BPF helper (tc flavour): __bpf_sk_lookup() for IPPROTO_UDP. */
7143 BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
7144 	   struct bpf_sock_tuple *, tuple, 
u32, len, u64, netns_id, u64, flags)
7145 {
	/* all device parameters come from skb->dev, with no NULL check */
7146 	struct net_device *dev = skb->dev;
7147 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7148 	struct net *caller_net = dev_net(dev);
7149
7150 	return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
7151 					      ifindex, IPPROTO_UDP, netns_id,
7152 					      flags, sdif);
7153 }
7154
/* Prototype: ctx, read-only tuple memory with its size (zero allowed), then
 * two scalars. Declared to return a socket pointer or NULL.
 */
7155 static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
7156 	.func = bpf_tc_sk_lookup_udp,
7157 	.gpl_only = false,
7158 	.pkt_access = true,
7159 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7160 	.arg1_type = ARG_PTR_TO_CTX,
7161 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7162 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7163 	.arg4_type = ARG_ANYTHING,
7164 	.arg5_type = ARG_ANYTHING,
7165 };
7166
/* BPF helper: release a socket obtained from a lookup helper.
 *
 * sock_gen_put() is called only when @sk is non-NULL and sk_is_refcounted()
 * says a reference is held. The helper always returns 0.
 */
7167 BPF_CALL_1(bpf_sk_release, struct sock *, sk)
7168 {
7169 	if (sk && sk_is_refcounted(sk))
7170 		sock_gen_put(sk);
7171 	return 0;
7172 }
7173
/* The single argument is a sock_common BTF pointer tagged OBJ_RELEASE. */
7174 static const struct bpf_func_proto bpf_sk_release_proto = {
7175 	.func = bpf_sk_release,
7176 	.gpl_only = false,
7177 	.ret_type = RET_INTEGER,
7178 	.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
7179 };
7180
/* BPF helper (XDP flavour): __bpf_sk_lookup() for IPPROTO_UDP.
 *
 * The device is ctx->rxq->dev, NULL is passed in place of an skb, and
 * netns_id is declared as u32 in this signature.
 */
7181 BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
7182 	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7183 {
7184 	struct net_device *dev = ctx->rxq->dev;
7185 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7186 	struct net *caller_net = dev_net(dev);
7187
7188 	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
7189 					      ifindex, IPPROTO_UDP, netns_id,
7190 					      flags, sdif);
7191 }
7192
7193 static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
7194 	.func = bpf_xdp_sk_lookup_udp,
7195 	.gpl_only = false,
7196 	.pkt_access = true,
7197 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7198 	.arg1_type = ARG_PTR_TO_CTX,
7199 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7200 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7201 	.arg4_type = ARG_ANYTHING,
7202 	.arg5_type = ARG_ANYTHING,
7203 };
7204
/* BPF helper (XDP flavour): __bpf_skc_lookup() for IPPROTO_TCP. */
7205 BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
7206 
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7207 {
	/* the device is ctx->rxq->dev; NULL is passed in place of an skb */
7208 	struct net_device *dev = ctx->rxq->dev;
7209 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7210 	struct net *caller_net = dev_net(dev);
7211
7212 	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
7213 					       ifindex, IPPROTO_TCP, netns_id,
7214 					       flags, sdif);
7215 }
7216
/* Prototype: ctx, read-only tuple memory with its size (zero allowed), then
 * two scalars. Declared to return a sock_common pointer or NULL.
 */
7217 static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
7218 	.func = bpf_xdp_skc_lookup_tcp,
7219 	.gpl_only = false,
7220 	.pkt_access = true,
7221 	.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7222 	.arg1_type = ARG_PTR_TO_CTX,
7223 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7224 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7225 	.arg4_type = ARG_ANYTHING,
7226 	.arg5_type = ARG_ANYTHING,
7227 };
7228
/* BPF helper (XDP flavour): __bpf_sk_lookup() for IPPROTO_TCP.
 *
 * The device is ctx->rxq->dev, NULL is passed in place of an skb, and
 * netns_id is declared as u32 in this signature.
 */
7229 BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
7230 	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
7231 {
7232 	struct net_device *dev = ctx->rxq->dev;
7233 	int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7234 	struct net *caller_net = dev_net(dev);
7235
7236 	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
7237 					      ifindex, IPPROTO_TCP, netns_id,
7238 					      flags, sdif);
7239 }
7240
/* Same argument layout; declared to return a socket pointer or NULL. */
7241 static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
7242 	.func = bpf_xdp_sk_lookup_tcp,
7243 	.gpl_only = false,
7244 	.pkt_access = true,
7245 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7246 	.arg1_type = ARG_PTR_TO_CTX,
7247 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7248 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7249 	.arg4_type = ARG_ANYTHING,
7250 	.arg5_type = ARG_ANYTHING,
7251 };
7252
/* BPF helper (sock_addr flavour): __bpf_skc_lookup() for IPPROTO_TCP.
 *
 * The caller namespace is sock_net(ctx->sk), ifindex is 0, no skb is passed
 * and sdif is given as -1.
 */
7253 BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7254 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7255 {
7256 	return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
7257 					       sock_net(ctx->sk), 0,
7258 					       IPPROTO_TCP, netns_id, flags,
7259 					       -1);
7260 }
7261
7262 static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { 
7263 	.func = bpf_sock_addr_skc_lookup_tcp,
7264 	.gpl_only = false,
7265 	.ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7266 	.arg1_type = ARG_PTR_TO_CTX,
7267 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7268 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7269 	.arg4_type = ARG_ANYTHING,
7270 	.arg5_type = ARG_ANYTHING,
7271 };
7272
/* BPF helper (sock_addr flavour): __bpf_sk_lookup() for IPPROTO_TCP.
 *
 * The caller namespace is sock_net(ctx->sk), ifindex is 0, no skb is passed
 * and sdif is given as -1.
 */
7273 BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7274 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7275 {
7276 	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7277 					      sock_net(ctx->sk), 0, IPPROTO_TCP,
7278 					      netns_id, flags, -1);
7279 }
7280
/* Prototype: ctx, read-only tuple memory with its size (zero allowed), then
 * two scalars. No pkt_access flag is set on this variant.
 */
7281 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
7282 	.func = bpf_sock_addr_sk_lookup_tcp,
7283 	.gpl_only = false,
7284 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7285 	.arg1_type = ARG_PTR_TO_CTX,
7286 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7287 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7288 	.arg4_type = ARG_ANYTHING,
7289 	.arg5_type = ARG_ANYTHING,
7290 };
7291
/* BPF helper (sock_addr flavour): __bpf_sk_lookup() for IPPROTO_UDP, with the
 * same fixed arguments as the TCP variant.
 */
7292 BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
7293 	   struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7294 {
7295 	return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7296 					      sock_net(ctx->sk), 0, IPPROTO_UDP,
7297 					      netns_id, flags, -1);
7298 }
7299
7300 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
7301 	.func = bpf_sock_addr_sk_lookup_udp,
7302 	.gpl_only = false,
7303 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7304 	.arg1_type = ARG_PTR_TO_CTX,
7305 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7306 	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
7307 	.arg4_type = ARG_ANYTHING,
7308 	.arg5_type = ARG_ANYTHING,
7309 };
7310
/* Decide whether a @size-byte access at offset @off of struct bpf_tcp_sock is
 * allowed.
 *
 * The offset must lie in [0, offsetofend(struct bpf_tcp_sock,
 * icsk_retransmits)) and be a multiple of @size. @type and @info are not
 * consulted in this function.
 */
7311 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7312 				  struct bpf_insn_access_aux *info)
7313 {
7314 	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
7315 					  icsk_retransmits))
7316 		return false;
7317
7318 	if (off % size != 0)
7319 		return false;
7320
7321 	switch (off) 
{
	/* bytes_received and bytes_acked must be accessed as __u64; every
	 * other offset must be accessed as __u32.
	 */
7322 	case offsetof(struct bpf_tcp_sock, bytes_received):
7323 	case offsetof(struct bpf_tcp_sock, bytes_acked):
7324 		return size == sizeof(__u64);
7325 	default:
7326 		return size == sizeof(__u32);
7327 	}
7328 }
7329
/* Rewrite a load from struct bpf_tcp_sock at si->off into a load from the
 * kernel-side structure.
 *
 * At most one instruction is written to @insn_buf, using si->dst_reg and
 * si->src_reg, and the number of instructions written is returned. @type,
 * @prog and @target_size are not referenced in this function.
 */
7330 u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
7331 				    const struct bpf_insn *si,
7332 				    struct bpf_insn *insn_buf,
7333 				    struct bpf_prog *prog, u32 *target_size)
7334 {
7335 	struct bpf_insn *insn = insn_buf;
7336
/* Emit a load of tcp_sock::FIELD sized to that field. The build fails if the
 * kernel field is wider than the same-named bpf_tcp_sock field.
 */
7337 #define BPF_TCP_SOCK_GET_COMMON(FIELD) \
7338 	do { \
7339 		BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
7340 			     sizeof_field(struct bpf_tcp_sock, FIELD)); \
7341 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
7342 				      si->dst_reg, si->src_reg, \
7343 				      offsetof(struct tcp_sock, FIELD)); \
7344 	} while (0)
7345
/* Same as above, reading the field from struct inet_connection_sock. */
7346 #define BPF_INET_SOCK_GET_COMMON(FIELD) \
7347 	do { \
7348 		BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
7349 					  FIELD) > \
7350 			     sizeof_field(struct bpf_tcp_sock, FIELD)); \
7351 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
7352 					struct inet_connection_sock, \
7353 					FIELD), \
7354 				      si->dst_reg, si->src_reg, \
7355 				      offsetof( \
7356 					struct inet_connection_sock, \
7357 					FIELD)); \
7358 	} while (0)
7359
7360 	BTF_TYPE_EMIT(struct bpf_tcp_sock);
7361
7362 	switch (si->off) {
7363 	case offsetof(struct bpf_tcp_sock, rtt_min):
7364 		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
7365 			     sizeof(struct minmax));
7366 		BUILD_BUG_ON(sizeof(struct minmax) <
7367 			     sizeof(struct minmax_sample));
7368
		/* rtt_min is read as a 32-bit word: the v member of a struct
		 * minmax_sample placed at the start of tcp_sock::rtt_min.
		 */
7369 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7370 				      offsetof(struct tcp_sock, rtt_min) +
7371 				      offsetof(struct minmax_sample, v));
7372 		break;
7373 	case offsetof(struct bpf_tcp_sock, snd_cwnd):
7374 		BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
7375 		break;
7376 	case offsetof(struct bpf_tcp_sock, srtt_us):
7377 		BPF_TCP_SOCK_GET_COMMON(srtt_us);
7378 		break;
7379 	case offsetof(struct bpf_tcp_sock, snd_ssthresh):
7380 		BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
7381 		break;
7382 	case offsetof(struct bpf_tcp_sock, rcv_nxt): 
/* Each case in this run maps one struct bpf_tcp_sock offset to a load of the
 * same-named field through BPF_TCP_SOCK_GET_COMMON().
 */
7383 		BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
7384 		break;
7385 	case offsetof(struct bpf_tcp_sock, snd_nxt):
7386 		BPF_TCP_SOCK_GET_COMMON(snd_nxt);
7387 		break;
7388 	case offsetof(struct bpf_tcp_sock, snd_una):
7389 		BPF_TCP_SOCK_GET_COMMON(snd_una);
7390 		break;
7391 	case offsetof(struct bpf_tcp_sock, mss_cache):
7392 		BPF_TCP_SOCK_GET_COMMON(mss_cache);
7393 		break;
7394 	case offsetof(struct bpf_tcp_sock, ecn_flags):
7395 		BPF_TCP_SOCK_GET_COMMON(ecn_flags);
7396 		break;
7397 	case offsetof(struct bpf_tcp_sock, rate_delivered):
7398 		BPF_TCP_SOCK_GET_COMMON(rate_delivered);
7399 		break;
7400 	case offsetof(struct bpf_tcp_sock, rate_interval_us):
7401 		BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
7402 		break;
7403 	case offsetof(struct bpf_tcp_sock, packets_out):
7404 		BPF_TCP_SOCK_GET_COMMON(packets_out);
7405 		break;
7406 	case offsetof(struct bpf_tcp_sock, retrans_out):
7407 		BPF_TCP_SOCK_GET_COMMON(retrans_out);
7408 		break;
7409 	case offsetof(struct bpf_tcp_sock, total_retrans):
7410 		BPF_TCP_SOCK_GET_COMMON(total_retrans);
7411 		break;
7412 	case offsetof(struct bpf_tcp_sock, segs_in):
7413 		BPF_TCP_SOCK_GET_COMMON(segs_in);
7414 		break;
7415 	case offsetof(struct bpf_tcp_sock, data_segs_in):
7416 		BPF_TCP_SOCK_GET_COMMON(data_segs_in);
7417 		break;
7418 	case offsetof(struct bpf_tcp_sock, segs_out):
7419 		BPF_TCP_SOCK_GET_COMMON(segs_out);
7420 		break;
7421 	case offsetof(struct bpf_tcp_sock, data_segs_out):
7422 		BPF_TCP_SOCK_GET_COMMON(data_segs_out);
7423 		break;
7424 	case offsetof(struct bpf_tcp_sock, lost_out):
7425 		BPF_TCP_SOCK_GET_COMMON(lost_out);
7426 		break;
7427 	case offsetof(struct bpf_tcp_sock, sacked_out):
7428 		BPF_TCP_SOCK_GET_COMMON(sacked_out);
7429 		break;
7430 	case offsetof(struct bpf_tcp_sock, bytes_received):
7431 		BPF_TCP_SOCK_GET_COMMON(bytes_received);
7432 		break;
7433 	case offsetof(struct bpf_tcp_sock, bytes_acked):
7434 		BPF_TCP_SOCK_GET_COMMON(bytes_acked);
7435 		break;
7436 	case offsetof(struct bpf_tcp_sock, dsack_dups):
7437 		BPF_TCP_SOCK_GET_COMMON(dsack_dups);
7438 		break; 
7439 	case offsetof(struct bpf_tcp_sock, delivered):
7440 		BPF_TCP_SOCK_GET_COMMON(delivered);
7441 		break;
7442 	case offsetof(struct bpf_tcp_sock, delivered_ce):
7443 		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
7444 		break;
7445 	case offsetof(struct bpf_tcp_sock, icsk_retransmits):
		/* the only field read from struct inet_connection_sock */
7446 		BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
7447 		break;
7448 	}
7449
	/* number of instructions written; 0 when no case matched */
7450 	return insn - insn_buf;
7451 }
7452
/* BPF helper: return @sk itself when it is a full socket whose sk_protocol is
 * IPPROTO_TCP, otherwise NULL.
 */
7453 BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
7454 {
7455 	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
7456 		return (unsigned long)sk;
7457
7458 	return (unsigned long)NULL;
7459 }
7460
7461 const struct bpf_func_proto bpf_tcp_sock_proto = {
7462 	.func = bpf_tcp_sock,
7463 	.gpl_only = false,
7464 	.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
7465 	.arg1_type = ARG_PTR_TO_SOCK_COMMON,
7466 };
7467
/* BPF helper: convert @sk with sk_to_full_sk() and return the result only
 * when it is non-NULL, in state TCP_LISTEN and flagged SOCK_RCU_FREE;
 * otherwise return NULL.
 */
7468 BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
7469 {
7470 	sk = sk_to_full_sk(sk);
7471
7472 	if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
7473 		return (unsigned long)sk;
7474
7475 	return (unsigned long)NULL;
7476 }
7477
7478 static const struct bpf_func_proto bpf_get_listener_sock_proto = {
7479 	.func = bpf_get_listener_sock,
7480 	.gpl_only = false,
7481 	.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7482 	.arg1_type = ARG_PTR_TO_SOCK_COMMON,
7483 };
7484
/* BPF helper: set the ECN CE mark on an IPv4 or IPv6 packet.
 *
 * Returns 0 without calling INET_ECN_set_ce() when the protocol is neither
 * ETH_P_IP nor ETH_P_IPV6, when the linear area is shorter than the fixed IP
 * header, or when the skb is cloned and not writable for that length.
 * Otherwise returns the result of INET_ECN_set_ce().
 */
7485 BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
7486 {
7487 	unsigned int iphdr_len;
7488
7489 	switch (skb_protocol(skb, true)) {
7490 	case cpu_to_be16(ETH_P_IP):
7491 		iphdr_len = sizeof(struct iphdr);
7492 		break;
7493 	case cpu_to_be16(ETH_P_IPV6):
7494 		iphdr_len = sizeof(struct ipv6hdr);
7495 		break;
7496 	default:
7497 		return 0;
7498 	}
7499
7500 	if (skb_headlen(skb) < iphdr_len)
7501 		return 0;
7502
7503 	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
7504 		return 0;
7505
7506 	return INET_ECN_set_ce(skb);
7507 }
7508
/* Decide whether a @size-byte access at offset @off of struct bpf_xdp_sock is
 * allowed. @type and @info are not consulted in this function.
 */
7509 bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7510 				  struct bpf_insn_access_aux *info)
7511 {
7512 	if (off < 0 || off >= offsetofend(struct 
bpf_xdp_sock, queue_id))
7513 		return false;
7514
7515 	if (off % size != 0)
7516 		return false;
7517
	/* the switch has only a default arm: every offset must be read as __u32 */
7518 	switch (off) {
7519 	default:
7520 		return size == sizeof(__u32);
7521 	}
7522 }
7523
/* Rewrite a load from struct bpf_xdp_sock at si->off into a load from struct
 * xdp_sock.
 *
 * Only queue_id is handled. The number of instructions written to @insn_buf
 * is returned (0 when the offset matches no case). @type, @prog and
 * @target_size are not referenced in this function.
 */
7524 u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
7525 				    const struct bpf_insn *si,
7526 				    struct bpf_insn *insn_buf,
7527 				    struct bpf_prog *prog, u32 *target_size)
7528 {
7529 	struct bpf_insn *insn = insn_buf;
7530
/* Emit a load of xdp_sock::FIELD sized to that field. The build fails if the
 * kernel field is wider than the same-named bpf_xdp_sock field.
 */
7531 #define BPF_XDP_SOCK_GET(FIELD) \
7532 	do { \
7533 		BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
7534 			     sizeof_field(struct bpf_xdp_sock, FIELD)); \
7535 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
7536 				      si->dst_reg, si->src_reg, \
7537 				      offsetof(struct xdp_sock, FIELD)); \
7538 	} while (0)
7539
7540 	BTF_TYPE_EMIT(struct bpf_xdp_sock);
7541
7542 	switch (si->off) {
7543 	case offsetof(struct bpf_xdp_sock, queue_id):
7544 		BPF_XDP_SOCK_GET(queue_id);
7545 		break;
7546 	}
7547
7548 	return insn - insn_buf;
7549 }
7550
/* Helper prototype for bpf_skb_ecn_set_ce(): a single ctx argument. */
7551 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
7552 	.func = bpf_skb_ecn_set_ce,
7553 	.gpl_only = false,
7554 	.ret_type = RET_INTEGER,
7555 	.arg1_type = ARG_PTR_TO_CTX,
7556 };
7557
/* BPF helper: check whether the ACK described by @iph/@th carries a valid SYN
 * cookie for listener @sk.
 *
 * Returns 0 when the version-specific cookie check yields a positive value.
 * -EINVAL: NULL @sk, @th_len or @iph_len too short, @sk not a TCP socket in
 *          TCP_LISTEN, the syncookies sysctl reads as 0, or an address-family
 *          mismatch between @sk and the packet.
 * -ENOENT: the segment is not a plain ACK (ack clear, or rst/syn set),
 *          tcp_synq_no_recent_overflow() is true, or the cookie check fails.
 * -EPROTONOSUPPORT: the IP version is not handled in this build.
 * -ENOTSUPP: built without CONFIG_SYN_COOKIES.
 */
7558 BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7559 	   struct tcphdr *, th, u32, th_len)
7560 {
7561 #ifdef CONFIG_SYN_COOKIES
7562 	int ret;
7563
7564 	if (unlikely(!sk || th_len < sizeof(*th)))
7565 		return -EINVAL;
7566
7567 	/* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. 
*/
7568 	if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
7569 		return -EINVAL;
7570
7571 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7572 		return -EINVAL;
7573
	/* only a plain ACK (ack set, rst and syn clear) is examined */
7574 	if (!th->ack || th->rst || th->syn)
7575 		return -ENOENT;
7576
7577 	if (unlikely(iph_len < sizeof(struct iphdr)))
7578 		return -EINVAL;
7579
7580 	if (tcp_synq_no_recent_overflow(sk))
7581 		return -ENOENT;
7582
7583 	/* Both struct iphdr and struct ipv6hdr have the version field at the
7584 	 * same offset so we can cast to the shorter header (struct iphdr).
7585 	 */
7586 	switch (((struct iphdr *)iph)->version) {
7587 	case 4:
		/* an IPv4 packet is refused on a v6-only AF_INET6 socket */
7588 		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7589 			return -EINVAL;
7590
7591 		ret = __cookie_v4_check((struct iphdr *)iph, th);
7592 		break;
7593
7594 #if IS_BUILTIN(CONFIG_IPV6)
7595 	case 6:
7596 		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7597 			return -EINVAL;
7598
7599 		if (sk->sk_family != AF_INET6)
7600 			return -EINVAL;
7601
7602 		ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
7603 		break;
7604 #endif /* CONFIG_IPV6 */
7605
7606 	default:
7607 		return -EPROTONOSUPPORT;
7608 	}
7609
	/* a positive result from the cookie check means the cookie is valid */
7610 	if (ret > 0)
7611 		return 0;
7612
7613 	return -ENOENT;
7614 #else
7615 	return -ENOTSUPP;
7616 #endif
7617 }
7618
/* Prototype: sock_common BTF pointer, then two read-only memory regions, each
 * followed by its constant size.
 */
7619 static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
7620 	.func = bpf_tcp_check_syncookie,
7621 	.gpl_only = true,
7622 	.pkt_access = true,
7623 	.ret_type = RET_INTEGER,
7624 	.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7625 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7626 	.arg3_type = ARG_CONST_SIZE,
7627 	.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7628 	.arg5_type = ARG_CONST_SIZE,
7629 };
7630
/* BPF helper: generate a SYN cookie for the SYN described by @iph/@th on
 * listener @sk.
 *
 * On success the return value carries the cookie in the low 32 bits and the
 * MSS in the bits above them. @th_len must be at least sizeof(*th) and equal
 * to th->doff * 4.
 */
7631 BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7632 	   struct tcphdr *, th, u32, th_len)
7633 {
7634 #ifdef CONFIG_SYN_COOKIES
7635 	u32 cookie;
7636 	u16 mss;
7637
7638 	if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
7639 		return -EINVAL;
7640
7641 	if (sk->sk_protocol != IPPROTO_TCP || 
sk->sk_state != TCP_LISTEN)
7642 		return -EINVAL;
7643
	/* here a zero syncookies sysctl yields -ENOENT */
7644 	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7645 		return -ENOENT;
7646
	/* only a plain SYN (syn set; ack, fin and rst clear) is accepted */
7647 	if (!th->syn || th->ack || th->fin || th->rst)
7648 		return -EINVAL;
7649
7650 	if (unlikely(iph_len < sizeof(struct iphdr)))
7651 		return -EINVAL;
7652
7653 	/* Both struct iphdr and struct ipv6hdr have the version field at the
7654 	 * same offset so we can cast to the shorter header (struct iphdr).
7655 	 */
7656 	switch (((struct iphdr *)iph)->version) {
7657 	case 4:
7658 		if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7659 			return -EINVAL;
7660
7661 		mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
7662 		break;
7663
7664 #if IS_BUILTIN(CONFIG_IPV6)
7665 	case 6:
7666 		if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7667 			return -EINVAL;
7668
7669 		if (sk->sk_family != AF_INET6)
7670 			return -EINVAL;
7671
7672 		mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
7673 		break;
7674 #endif /* CONFIG_IPV6 */
7675
7676 	default:
7677 		return -EPROTONOSUPPORT;
7678 	}
	/* an mss of 0 from the cookie generator is reported as -ENOENT */
7679 	if (mss == 0)
7680 		return -ENOENT;
7681
	/* cookie in bits 0-31, mss starting at bit 32 */
7682 	return cookie | ((u64)mss << 32);
7683 #else
7684 	return -EOPNOTSUPP;
7685 #endif /* CONFIG_SYN_COOKIES */
7686 }
7687
/* Prototype: sock_common BTF pointer, then two read-only memory regions, each
 * followed by its constant size.
 */
7688 static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
7689 	.func = bpf_tcp_gen_syncookie,
7690 	.gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
7691 	.pkt_access = true,
7692 	.ret_type = RET_INTEGER,
7693 	.arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7694 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7695 	.arg3_type = ARG_CONST_SIZE,
7696 	.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7697 	.arg5_type = ARG_CONST_SIZE,
7698 };
7699
/* BPF helper: attach @sk to @skb.
 *
 * -EINVAL:      @sk is NULL or @flags is non-zero.
 * -EOPNOTSUPP:  skb_at_tc_ingress() is false, or @sk is unhashed.
 * -ENETUNREACH: skb->dev and @sk belong to different network namespaces.
 * -ENOENT:      @sk is refcounted and its refcount could not be raised from a
 *               non-zero value.
 * On success the skb is orphaned, skb->sk is set to @sk, sock_pfree is
 * installed as the destructor and 0 is returned.
 */
7700 BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
7701 {
7702 	if (!sk || flags != 0)
7703 		return -EINVAL;
7704 	if (!skb_at_tc_ingress(skb))
7705 		return -EOPNOTSUPP;
7706 	if (unlikely(dev_net(skb->dev) != sock_net(sk)))
7707 		return -ENETUNREACH;
7708 	if (sk_unhashed(sk))
7709 		return -EOPNOTSUPP;
7710 	if (sk_is_refcounted(sk) &&
7711 
unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
7712 		return -ENOENT;
7713
	/* drop the previous owner, then install sk with sock_pfree as destructor */
7714 	skb_orphan(skb);
7715 	skb->sk = sk;
7716 	skb->destructor = sock_pfree;
7717
7718 	return 0;
7719 }
7720
/* Prototype: ctx, a sock_common BTF pointer and a scalar. */
7721 static const struct bpf_func_proto bpf_sk_assign_proto = {
7722 	.func = bpf_sk_assign,
7723 	.gpl_only = false,
7724 	.ret_type = RET_INTEGER,
7725 	.arg1_type = ARG_PTR_TO_CTX,
7726 	.arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7727 	.arg3_type = ARG_ANYTHING,
7728 };
7729
/* Scan the TCP options in [@op, @opend) for an option of kind @search_kind.
 *
 * When @magic_len is non-zero the option must also start its payload (the
 * bytes after kind and length) with the @magic_len bytes at @magic.
 *
 * Returns a pointer to the matching option's kind byte, or:
 *   ERR_PTR(-ENOMSG) when the range is exhausted, TCPOPT_EOL is met (*@eol is
 *                    then set to true), or an option of the right kind is too
 *                    short to hold the magic;
 *   ERR_PTR(-EFAULT) when an option's length byte is missing, smaller than 2,
 *                    or larger than the bytes remaining.
 * *@eol is false on every other return path.
 */
7730 static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
7731 				    u8 search_kind, const u8 *magic,
7732 				    u8 magic_len, bool *eol)
7733 {
7734 	u8 kind, kind_len;
7735
7736 	*eol = false;
7737
7738 	while (op < opend) {
7739 		kind = op[0];
7740
7741 		if (kind == TCPOPT_EOL) {
7742 			*eol = true;
7743 			return ERR_PTR(-ENOMSG);
7744 		} else if (kind == TCPOPT_NOP) {
			/* NOP is a single byte with no length field */
7745 			op++;
7746 			continue;
7747 		}
7748
7749 		if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
7750 			/* Something is wrong in the received header.
7751 			 * Follow the TCP stack's tcp_parse_options()
7752 			 * and just bail here.
7753 			 */
7754 			return ERR_PTR(-EFAULT);
7755
7756 		kind_len = op[1];
7757 		if (search_kind == kind) {
7758 			if (!magic_len)
7759 				return op;
7760
			/* kind_len counts the kind and length bytes, hence - 2 */
7761 			if (magic_len > kind_len - 2)
7762 				return ERR_PTR(-ENOMSG);
7763
7764 			if (!memcmp(&op[2], magic, magic_len))
7765 				return op;
7766 		}
7767
7768 		op += kind_len;
7769 	}
7770
7771 	return ERR_PTR(-ENOMSG);
7772 }
7773
/* BPF helper: find a TCP header option and copy it into @search_res.
 *
 * On entry @search_res[0] holds the option kind to look for and
 * @search_res[1] a search length; @len is the size of the buffer. With
 * BPF_LOAD_HDR_OPT_TCP_SYN set in @flags the options of the SYN obtained from
 * bpf_sock_ops_get_syn() are searched, otherwise those of bpf_sock->skb.
 */
7774 BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7775 	   void *, search_res, u32, len, u64, flags)
7776 {
7777 	bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
7778 	const u8 *op, *opend, *magic, *search = search_res;
7779 	u8 search_kind, search_len, copy_len, magic_len;
7780 	int ret;
7781
7782 	if (!is_locked_tcp_sock_ops(bpf_sock))
7783 		return -EOPNOTSUPP;
7784
7785 	/* 2 byte is the minimal option len except TCPOPT_NOP and
7786 	 * TCPOPT_EOL which are useless for the bpf prog to learn
7787 	 * and this helper disallow loading them also. 
7788 	 */
7789 	if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
7790 		return -EINVAL;
7791
7792 	search_kind = search[0];
7793 	search_len = search[1];
7794
7795 	if (search_len > len || search_kind == TCPOPT_NOP ||
7796 	    search_kind == TCPOPT_EOL)
7797 		return -EINVAL;
7798
7799 	if (search_kind == TCPOPT_EXP || search_kind == 253) {
7800 		/* 16 or 32 bit magic. +2 for kind and kind length */
7801 		if (search_len != 4 && search_len != 6)
7802 			return -EINVAL;
7803 		magic = &search[2];
7804 		magic_len = search_len - 2;
7805 	} else {
		/* other kinds are matched by kind only; search_len must be 0 */
7806 		if (search_len)
7807 			return -EINVAL;
7808 		magic = NULL;
7809 		magic_len = 0;
7810 	}
7811
7812 	if (load_syn) {
		/* ret is used as the byte count of the SYN that op points at;
		 * the options start after the fixed TCP header.
		 */
7813 		ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
7814 		if (ret < 0)
7815 			return ret;
7816
7817 		opend = op + ret;
7818 		op += sizeof(struct tcphdr);
7819 	} else {
7820 		if (!bpf_sock->skb ||
7821 		    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7822 			/* This bpf_sock->op cannot call this helper */
7823 			return -EPERM;
7824
7825 		opend = bpf_sock->skb_data_end;
7826 		op = bpf_sock->skb->data + sizeof(struct tcphdr);
7827 	}
7828
7829 	op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
7830 				&eol);
7831 	if (IS_ERR(op))
7832 		return PTR_ERR(op);
7833
	/* Copy the option, truncated to len bytes. The return value is the
	 * option's own length, or -ENOSPC when truncation happened.
	 */
7834 	copy_len = op[1];
7835 	ret = copy_len;
7836 	if (copy_len > len) {
7837 		ret = -ENOSPC;
7838 		copy_len = len;
7839 	}
7840
7841 	memcpy(search_res, op, copy_len);
7842 	return ret;
7843 }
7844
/* Prototype: ctx, a writable memory region with its constant size, and a
 * scalar.
 */
7845 static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
7846 	.func = bpf_sock_ops_load_hdr_opt,
7847 	.gpl_only = false,
7848 	.ret_type = RET_INTEGER,
7849 	.arg1_type = ARG_PTR_TO_CTX,
7850 	.arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
7851 	.arg3_type = ARG_CONST_SIZE,
7852 	.arg4_type = ARG_ANYTHING,
7853 };
7854
/* BPF helper: append the TCP option at @from to the header being written.
 *
 * @from[0] is the option kind and @from[1] its total length; @len is the size
 * of the buffer at @from and @flags must be 0.
 */
7855 BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7856 	   const void *, from, u32, len, u64, flags)
7857 {
7858 	u8 new_kind, new_kind_len, magic_len = 0, *opend;
7859 	const u8 *op, *new_op, *magic = NULL;
7860 	struct sk_buff *skb;
7861 	bool 
eol;
7862
	/* storing is only allowed from the WRITE_HDR_OPT callback */
7863 	if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
7864 		return -EPERM;
7865
7866 	if (len < 2 || flags)
7867 		return -EINVAL;
7868
7869 	new_op = from;
7870 	new_kind = new_op[0];
7871 	new_kind_len = new_op[1];
7872
7873 	if (new_kind_len > len || new_kind == TCPOPT_NOP ||
7874 	    new_kind == TCPOPT_EOL)
7875 		return -EINVAL;
7876
7877 	if (new_kind_len > bpf_sock->remaining_opt_len)
7878 		return -ENOSPC;
7879
7880 	/* 253 is another experimental kind */
7881 	if (new_kind == TCPOPT_EXP || new_kind == 253) {
7882 		if (new_kind_len < 4)
7883 			return -EINVAL;
7884 		/* Match for the 2 byte magic also.
7885 		 * RFC 6994: the magic could be 2 or 4 bytes.
7886 		 * Hence, matching by 2 byte only is on the
7887 		 * conservative side but it is the right
7888 		 * thing to do for the 'search-for-duplication'
7889 		 * purpose.
7890 		 */
7891 		magic = &new_op[2];
7892 		magic_len = 2;
7893 	}
7894
7895 	/* Check for duplication */
7896 	skb = bpf_sock->skb;
7897 	op = skb->data + sizeof(struct tcphdr);
7898 	opend = bpf_sock->skb_data_end;
7899
7900 	op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
7901 				&eol);
	/* a successful search means the option is already present */
7902 	if (!IS_ERR(op))
7903 		return -EEXIST;
7904
	/* any search error other than -ENOMSG is passed on to the caller */
7905 	if (PTR_ERR(op) != -ENOMSG)
7906 		return PTR_ERR(op);
7907
7908 	if (eol)
7909 		/* The option has been ended. Treat it as no more
7910 		 * header option can be written.
7911 		 */
7912 		return -ENOSPC;
7913
7914 	/* No duplication found. Store the header option. 
*/
7915 	memcpy(opend, from, new_kind_len);
7916
	/* account for the bytes just written at the old skb_data_end */
7917 	bpf_sock->remaining_opt_len -= new_kind_len;
7918 	bpf_sock->skb_data_end += new_kind_len;
7919
7920 	return 0;
7921 }
7922
/* Prototype: ctx, a read-only memory region with its constant size, and a
 * scalar.
 */
7923 static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
7924 	.func = bpf_sock_ops_store_hdr_opt,
7925 	.gpl_only = false,
7926 	.ret_type = RET_INTEGER,
7927 	.arg1_type = ARG_PTR_TO_CTX,
7928 	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7929 	.arg3_type = ARG_CONST_SIZE,
7930 	.arg4_type = ARG_ANYTHING,
7931 };
7932
/* BPF helper: reserve @len bytes of header-option space.
 *
 * -EPERM when not called from the HDR_OPT_LEN callback, -EINVAL when @flags
 * is non-zero or @len < 2, -ENOSPC when @len exceeds remaining_opt_len.
 * On success remaining_opt_len is reduced by @len and 0 is returned.
 */
7933 BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7934 	   u32, len, u64, flags)
7935 {
7936 	if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7937 		return -EPERM;
7938
7939 	if (flags || len < 2)
7940 		return -EINVAL;
7941
7942 	if (len > bpf_sock->remaining_opt_len)
7943 		return -ENOSPC;
7944
7945 	bpf_sock->remaining_opt_len -= len;
7946
7947 	return 0;
7948 }
7949
7950 static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
7951 	.func = bpf_sock_ops_reserve_hdr_opt,
7952 	.gpl_only = false,
7953 	.ret_type = RET_INTEGER,
7954 	.arg1_type = ARG_PTR_TO_CTX,
7955 	.arg2_type = ARG_ANYTHING,
7956 	.arg3_type = ARG_ANYTHING,
7957 };
7958
/* BPF helper: set skb->tstamp and skb->tstamp_type.
 *
 * -EOPNOTSUPP when skb->protocol is neither ETH_P_IP nor ETH_P_IPV6.
 * -EINVAL for an unknown @tstamp_type, or for a zero @tstamp with the
 * MONOTONIC or TAI clock. The REALTIME clock accepts any @tstamp value.
 */
7959 BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
7960 	   u64, tstamp, u32, tstamp_type)
7961 {
7962 	/* skb_clear_delivery_time() is done for inet protocol */
7963 	if (skb->protocol != htons(ETH_P_IP) &&
7964 	    skb->protocol != htons(ETH_P_IPV6))
7965 		return -EOPNOTSUPP;
7966
7967 	switch (tstamp_type) {
7968 	case BPF_SKB_CLOCK_REALTIME:
7969 		skb->tstamp = tstamp;
7970 		skb->tstamp_type = SKB_CLOCK_REALTIME;
7971 		break;
7972 	case BPF_SKB_CLOCK_MONOTONIC:
7973 		if (!tstamp)
7974 			return -EINVAL;
7975 		skb->tstamp = tstamp;
7976 		skb->tstamp_type = SKB_CLOCK_MONOTONIC;
7977 		break;
7978 	case BPF_SKB_CLOCK_TAI:
7979 		if (!tstamp)
7980 			return -EINVAL;
7981 		skb->tstamp = tstamp;
7982 		skb->tstamp_type = SKB_CLOCK_TAI;
7983 		break;
7984 	default:
7985 		return -EINVAL;
7986 	}
7987
7988 	return 0;
7989 }
7990
7991 
static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { 7992 .func = bpf_skb_set_tstamp, 7993 .gpl_only = false, 7994 .ret_type = RET_INTEGER, 7995 .arg1_type = ARG_PTR_TO_CTX, 7996 .arg2_type = ARG_ANYTHING, 7997 .arg3_type = ARG_ANYTHING, 7998 }; 7999 8000 #ifdef CONFIG_SYN_COOKIES 8001 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph, 8002 struct tcphdr *, th, u32, th_len) 8003 { 8004 u32 cookie; 8005 u16 mss; 8006 8007 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) 8008 return -EINVAL; 8009 8010 mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT; 8011 cookie = __cookie_v4_init_sequence(iph, th, &mss); 8012 8013 return cookie | ((u64)mss << 32); 8014 } 8015 8016 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = { 8017 .func = bpf_tcp_raw_gen_syncookie_ipv4, 8018 .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */ 8019 .pkt_access = true, 8020 .ret_type = RET_INTEGER, 8021 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8022 .arg1_size = sizeof(struct iphdr), 8023 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 8024 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 8025 }; 8026 8027 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph, 8028 struct tcphdr *, th, u32, th_len) 8029 { 8030 #if IS_BUILTIN(CONFIG_IPV6) 8031 const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - 8032 sizeof(struct ipv6hdr); 8033 u32 cookie; 8034 u16 mss; 8035 8036 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) 8037 return -EINVAL; 8038 8039 mss = tcp_parse_mss_option(th, 0) ?: mss_clamp; 8040 cookie = __cookie_v6_init_sequence(iph, th, &mss); 8041 8042 return cookie | ((u64)mss << 32); 8043 #else 8044 return -EPROTONOSUPPORT; 8045 #endif 8046 } 8047 8048 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = { 8049 .func = bpf_tcp_raw_gen_syncookie_ipv6, 8050 .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */ 8051 .pkt_access = true, 8052 .ret_type = RET_INTEGER, 8053 
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8054 .arg1_size = sizeof(struct ipv6hdr), 8055 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 8056 .arg3_type = ARG_CONST_SIZE_OR_ZERO, 8057 }; 8058 8059 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph, 8060 struct tcphdr *, th) 8061 { 8062 if (__cookie_v4_check(iph, th) > 0) 8063 return 0; 8064 8065 return -EACCES; 8066 } 8067 8068 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = { 8069 .func = bpf_tcp_raw_check_syncookie_ipv4, 8070 .gpl_only = true, /* __cookie_v4_check is GPL */ 8071 .pkt_access = true, 8072 .ret_type = RET_INTEGER, 8073 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8074 .arg1_size = sizeof(struct iphdr), 8075 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8076 .arg2_size = sizeof(struct tcphdr), 8077 }; 8078 8079 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph, 8080 struct tcphdr *, th) 8081 { 8082 #if IS_BUILTIN(CONFIG_IPV6) 8083 if (__cookie_v6_check(iph, th) > 0) 8084 return 0; 8085 8086 return -EACCES; 8087 #else 8088 return -EPROTONOSUPPORT; 8089 #endif 8090 } 8091 8092 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = { 8093 .func = bpf_tcp_raw_check_syncookie_ipv6, 8094 .gpl_only = true, /* __cookie_v6_check is GPL */ 8095 .pkt_access = true, 8096 .ret_type = RET_INTEGER, 8097 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8098 .arg1_size = sizeof(struct ipv6hdr), 8099 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_RDONLY, 8100 .arg2_size = sizeof(struct tcphdr), 8101 }; 8102 #endif /* CONFIG_SYN_COOKIES */ 8103 8104 #endif /* CONFIG_INET */ 8105 8106 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id) 8107 { 8108 switch (func_id) { 8109 case BPF_FUNC_clone_redirect: 8110 case BPF_FUNC_l3_csum_replace: 8111 case BPF_FUNC_l4_csum_replace: 8112 case BPF_FUNC_lwt_push_encap: 8113 case BPF_FUNC_lwt_seg6_action: 8114 case BPF_FUNC_lwt_seg6_adjust_srh: 8115 case 
BPF_FUNC_lwt_seg6_store_bytes: 8116 case BPF_FUNC_msg_pop_data: 8117 case BPF_FUNC_msg_pull_data: 8118 case BPF_FUNC_msg_push_data: 8119 case BPF_FUNC_skb_adjust_room: 8120 case BPF_FUNC_skb_change_head: 8121 case BPF_FUNC_skb_change_proto: 8122 case BPF_FUNC_skb_change_tail: 8123 case BPF_FUNC_skb_pull_data: 8124 case BPF_FUNC_skb_store_bytes: 8125 case BPF_FUNC_skb_vlan_pop: 8126 case BPF_FUNC_skb_vlan_push: 8127 case BPF_FUNC_store_hdr_opt: 8128 case BPF_FUNC_xdp_adjust_head: 8129 case BPF_FUNC_xdp_adjust_meta: 8130 case BPF_FUNC_xdp_adjust_tail: 8131 /* tail-called program could call any of the above */ 8132 case BPF_FUNC_tail_call: 8133 return true; 8134 default: 8135 return false; 8136 } 8137 } 8138 8139 const struct bpf_func_proto bpf_event_output_data_proto __weak; 8140 const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; 8141 8142 static const struct bpf_func_proto * 8143 sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8144 { 8145 const struct bpf_func_proto *func_proto; 8146 8147 func_proto = cgroup_common_func_proto(func_id, prog); 8148 if (func_proto) 8149 return func_proto; 8150 8151 switch (func_id) { 8152 case BPF_FUNC_get_socket_cookie: 8153 return &bpf_get_socket_cookie_sock_proto; 8154 case BPF_FUNC_get_netns_cookie: 8155 return &bpf_get_netns_cookie_sock_proto; 8156 case BPF_FUNC_perf_event_output: 8157 return &bpf_event_output_data_proto; 8158 case BPF_FUNC_sk_storage_get: 8159 return &bpf_sk_storage_get_cg_sock_proto; 8160 case BPF_FUNC_ktime_get_coarse_ns: 8161 return &bpf_ktime_get_coarse_ns_proto; 8162 case BPF_FUNC_setsockopt: 8163 switch (prog->expected_attach_type) { 8164 case BPF_CGROUP_INET_SOCK_CREATE: 8165 return &bpf_sock_create_setsockopt_proto; 8166 default: 8167 return NULL; 8168 } 8169 case BPF_FUNC_getsockopt: 8170 switch (prog->expected_attach_type) { 8171 case BPF_CGROUP_INET_SOCK_CREATE: 8172 return &bpf_sock_create_getsockopt_proto; 8173 default: 8174 return NULL; 8175 } 8176 
default: 8177 return bpf_base_func_proto(func_id, prog); 8178 } 8179 } 8180 8181 static const struct bpf_func_proto * 8182 sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8183 { 8184 const struct bpf_func_proto *func_proto; 8185 8186 func_proto = cgroup_common_func_proto(func_id, prog); 8187 if (func_proto) 8188 return func_proto; 8189 8190 switch (func_id) { 8191 case BPF_FUNC_bind: 8192 switch (prog->expected_attach_type) { 8193 case BPF_CGROUP_INET4_CONNECT: 8194 case BPF_CGROUP_INET6_CONNECT: 8195 return &bpf_bind_proto; 8196 default: 8197 return NULL; 8198 } 8199 case BPF_FUNC_get_socket_cookie: 8200 return &bpf_get_socket_cookie_sock_addr_proto; 8201 case BPF_FUNC_get_netns_cookie: 8202 return &bpf_get_netns_cookie_sock_addr_proto; 8203 case BPF_FUNC_perf_event_output: 8204 return &bpf_event_output_data_proto; 8205 #ifdef CONFIG_INET 8206 case BPF_FUNC_sk_lookup_tcp: 8207 return &bpf_sock_addr_sk_lookup_tcp_proto; 8208 case BPF_FUNC_sk_lookup_udp: 8209 return &bpf_sock_addr_sk_lookup_udp_proto; 8210 case BPF_FUNC_sk_release: 8211 return &bpf_sk_release_proto; 8212 case BPF_FUNC_skc_lookup_tcp: 8213 return &bpf_sock_addr_skc_lookup_tcp_proto; 8214 #endif /* CONFIG_INET */ 8215 case BPF_FUNC_sk_storage_get: 8216 return &bpf_sk_storage_get_proto; 8217 case BPF_FUNC_sk_storage_delete: 8218 return &bpf_sk_storage_delete_proto; 8219 case BPF_FUNC_setsockopt: 8220 switch (prog->expected_attach_type) { 8221 case BPF_CGROUP_INET4_BIND: 8222 case BPF_CGROUP_INET6_BIND: 8223 case BPF_CGROUP_INET4_CONNECT: 8224 case BPF_CGROUP_INET6_CONNECT: 8225 case BPF_CGROUP_UNIX_CONNECT: 8226 case BPF_CGROUP_UDP4_RECVMSG: 8227 case BPF_CGROUP_UDP6_RECVMSG: 8228 case BPF_CGROUP_UNIX_RECVMSG: 8229 case BPF_CGROUP_UDP4_SENDMSG: 8230 case BPF_CGROUP_UDP6_SENDMSG: 8231 case BPF_CGROUP_UNIX_SENDMSG: 8232 case BPF_CGROUP_INET4_GETPEERNAME: 8233 case BPF_CGROUP_INET6_GETPEERNAME: 8234 case BPF_CGROUP_UNIX_GETPEERNAME: 8235 case BPF_CGROUP_INET4_GETSOCKNAME: 8236 
case BPF_CGROUP_INET6_GETSOCKNAME: 8237 case BPF_CGROUP_UNIX_GETSOCKNAME: 8238 return &bpf_sock_addr_setsockopt_proto; 8239 default: 8240 return NULL; 8241 } 8242 case BPF_FUNC_getsockopt: 8243 switch (prog->expected_attach_type) { 8244 case BPF_CGROUP_INET4_BIND: 8245 case BPF_CGROUP_INET6_BIND: 8246 case BPF_CGROUP_INET4_CONNECT: 8247 case BPF_CGROUP_INET6_CONNECT: 8248 case BPF_CGROUP_UNIX_CONNECT: 8249 case BPF_CGROUP_UDP4_RECVMSG: 8250 case BPF_CGROUP_UDP6_RECVMSG: 8251 case BPF_CGROUP_UNIX_RECVMSG: 8252 case BPF_CGROUP_UDP4_SENDMSG: 8253 case BPF_CGROUP_UDP6_SENDMSG: 8254 case BPF_CGROUP_UNIX_SENDMSG: 8255 case BPF_CGROUP_INET4_GETPEERNAME: 8256 case BPF_CGROUP_INET6_GETPEERNAME: 8257 case BPF_CGROUP_UNIX_GETPEERNAME: 8258 case BPF_CGROUP_INET4_GETSOCKNAME: 8259 case BPF_CGROUP_INET6_GETSOCKNAME: 8260 case BPF_CGROUP_UNIX_GETSOCKNAME: 8261 return &bpf_sock_addr_getsockopt_proto; 8262 default: 8263 return NULL; 8264 } 8265 default: 8266 return bpf_sk_base_func_proto(func_id, prog); 8267 } 8268 } 8269 8270 static const struct bpf_func_proto * 8271 sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8272 { 8273 switch (func_id) { 8274 case BPF_FUNC_skb_load_bytes: 8275 return &bpf_skb_load_bytes_proto; 8276 case BPF_FUNC_skb_load_bytes_relative: 8277 return &bpf_skb_load_bytes_relative_proto; 8278 case BPF_FUNC_get_socket_cookie: 8279 return &bpf_get_socket_cookie_proto; 8280 case BPF_FUNC_get_netns_cookie: 8281 return &bpf_get_netns_cookie_proto; 8282 case BPF_FUNC_get_socket_uid: 8283 return &bpf_get_socket_uid_proto; 8284 case BPF_FUNC_perf_event_output: 8285 return &bpf_skb_event_output_proto; 8286 default: 8287 return bpf_sk_base_func_proto(func_id, prog); 8288 } 8289 } 8290 8291 const struct bpf_func_proto bpf_sk_storage_get_proto __weak; 8292 const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; 8293 8294 static const struct bpf_func_proto * 8295 cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 
8296 { 8297 const struct bpf_func_proto *func_proto; 8298 8299 func_proto = cgroup_common_func_proto(func_id, prog); 8300 if (func_proto) 8301 return func_proto; 8302 8303 switch (func_id) { 8304 case BPF_FUNC_sk_fullsock: 8305 return &bpf_sk_fullsock_proto; 8306 case BPF_FUNC_sk_storage_get: 8307 return &bpf_sk_storage_get_proto; 8308 case BPF_FUNC_sk_storage_delete: 8309 return &bpf_sk_storage_delete_proto; 8310 case BPF_FUNC_perf_event_output: 8311 return &bpf_skb_event_output_proto; 8312 #ifdef CONFIG_SOCK_CGROUP_DATA 8313 case BPF_FUNC_skb_cgroup_id: 8314 return &bpf_skb_cgroup_id_proto; 8315 case BPF_FUNC_skb_ancestor_cgroup_id: 8316 return &bpf_skb_ancestor_cgroup_id_proto; 8317 case BPF_FUNC_sk_cgroup_id: 8318 return &bpf_sk_cgroup_id_proto; 8319 case BPF_FUNC_sk_ancestor_cgroup_id: 8320 return &bpf_sk_ancestor_cgroup_id_proto; 8321 #endif 8322 #ifdef CONFIG_INET 8323 case BPF_FUNC_sk_lookup_tcp: 8324 return &bpf_sk_lookup_tcp_proto; 8325 case BPF_FUNC_sk_lookup_udp: 8326 return &bpf_sk_lookup_udp_proto; 8327 case BPF_FUNC_sk_release: 8328 return &bpf_sk_release_proto; 8329 case BPF_FUNC_skc_lookup_tcp: 8330 return &bpf_skc_lookup_tcp_proto; 8331 case BPF_FUNC_tcp_sock: 8332 return &bpf_tcp_sock_proto; 8333 case BPF_FUNC_get_listener_sock: 8334 return &bpf_get_listener_sock_proto; 8335 case BPF_FUNC_skb_ecn_set_ce: 8336 return &bpf_skb_ecn_set_ce_proto; 8337 #endif 8338 default: 8339 return sk_filter_func_proto(func_id, prog); 8340 } 8341 } 8342 8343 static const struct bpf_func_proto * 8344 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8345 { 8346 switch (func_id) { 8347 case BPF_FUNC_skb_store_bytes: 8348 return &bpf_skb_store_bytes_proto; 8349 case BPF_FUNC_skb_load_bytes: 8350 return &bpf_skb_load_bytes_proto; 8351 case BPF_FUNC_skb_load_bytes_relative: 8352 return &bpf_skb_load_bytes_relative_proto; 8353 case BPF_FUNC_skb_pull_data: 8354 return &bpf_skb_pull_data_proto; 8355 case BPF_FUNC_csum_diff: 8356 return 
&bpf_csum_diff_proto; 8357 case BPF_FUNC_csum_update: 8358 return &bpf_csum_update_proto; 8359 case BPF_FUNC_csum_level: 8360 return &bpf_csum_level_proto; 8361 case BPF_FUNC_l3_csum_replace: 8362 return &bpf_l3_csum_replace_proto; 8363 case BPF_FUNC_l4_csum_replace: 8364 return &bpf_l4_csum_replace_proto; 8365 case BPF_FUNC_clone_redirect: 8366 return &bpf_clone_redirect_proto; 8367 case BPF_FUNC_get_cgroup_classid: 8368 return &bpf_get_cgroup_classid_proto; 8369 case BPF_FUNC_skb_vlan_push: 8370 return &bpf_skb_vlan_push_proto; 8371 case BPF_FUNC_skb_vlan_pop: 8372 return &bpf_skb_vlan_pop_proto; 8373 case BPF_FUNC_skb_change_proto: 8374 return &bpf_skb_change_proto_proto; 8375 case BPF_FUNC_skb_change_type: 8376 return &bpf_skb_change_type_proto; 8377 case BPF_FUNC_skb_adjust_room: 8378 return &bpf_skb_adjust_room_proto; 8379 case BPF_FUNC_skb_change_tail: 8380 return &bpf_skb_change_tail_proto; 8381 case BPF_FUNC_skb_change_head: 8382 return &bpf_skb_change_head_proto; 8383 case BPF_FUNC_skb_get_tunnel_key: 8384 return &bpf_skb_get_tunnel_key_proto; 8385 case BPF_FUNC_skb_set_tunnel_key: 8386 return bpf_get_skb_set_tunnel_proto(func_id); 8387 case BPF_FUNC_skb_get_tunnel_opt: 8388 return &bpf_skb_get_tunnel_opt_proto; 8389 case BPF_FUNC_skb_set_tunnel_opt: 8390 return bpf_get_skb_set_tunnel_proto(func_id); 8391 case BPF_FUNC_redirect: 8392 return &bpf_redirect_proto; 8393 case BPF_FUNC_redirect_neigh: 8394 return &bpf_redirect_neigh_proto; 8395 case BPF_FUNC_redirect_peer: 8396 return &bpf_redirect_peer_proto; 8397 case BPF_FUNC_get_route_realm: 8398 return &bpf_get_route_realm_proto; 8399 case BPF_FUNC_get_hash_recalc: 8400 return &bpf_get_hash_recalc_proto; 8401 case BPF_FUNC_set_hash_invalid: 8402 return &bpf_set_hash_invalid_proto; 8403 case BPF_FUNC_set_hash: 8404 return &bpf_set_hash_proto; 8405 case BPF_FUNC_perf_event_output: 8406 return &bpf_skb_event_output_proto; 8407 case BPF_FUNC_get_smp_processor_id: 8408 return &bpf_get_smp_processor_id_proto; 
8409 case BPF_FUNC_skb_under_cgroup: 8410 return &bpf_skb_under_cgroup_proto; 8411 case BPF_FUNC_get_socket_cookie: 8412 return &bpf_get_socket_cookie_proto; 8413 case BPF_FUNC_get_netns_cookie: 8414 return &bpf_get_netns_cookie_proto; 8415 case BPF_FUNC_get_socket_uid: 8416 return &bpf_get_socket_uid_proto; 8417 case BPF_FUNC_fib_lookup: 8418 return &bpf_skb_fib_lookup_proto; 8419 case BPF_FUNC_check_mtu: 8420 return &bpf_skb_check_mtu_proto; 8421 case BPF_FUNC_sk_fullsock: 8422 return &bpf_sk_fullsock_proto; 8423 case BPF_FUNC_sk_storage_get: 8424 return &bpf_sk_storage_get_proto; 8425 case BPF_FUNC_sk_storage_delete: 8426 return &bpf_sk_storage_delete_proto; 8427 #ifdef CONFIG_XFRM 8428 case BPF_FUNC_skb_get_xfrm_state: 8429 return &bpf_skb_get_xfrm_state_proto; 8430 #endif 8431 #ifdef CONFIG_CGROUP_NET_CLASSID 8432 case BPF_FUNC_skb_cgroup_classid: 8433 return &bpf_skb_cgroup_classid_proto; 8434 #endif 8435 #ifdef CONFIG_SOCK_CGROUP_DATA 8436 case BPF_FUNC_skb_cgroup_id: 8437 return &bpf_skb_cgroup_id_proto; 8438 case BPF_FUNC_skb_ancestor_cgroup_id: 8439 return &bpf_skb_ancestor_cgroup_id_proto; 8440 #endif 8441 #ifdef CONFIG_INET 8442 case BPF_FUNC_sk_lookup_tcp: 8443 return &bpf_tc_sk_lookup_tcp_proto; 8444 case BPF_FUNC_sk_lookup_udp: 8445 return &bpf_tc_sk_lookup_udp_proto; 8446 case BPF_FUNC_sk_release: 8447 return &bpf_sk_release_proto; 8448 case BPF_FUNC_tcp_sock: 8449 return &bpf_tcp_sock_proto; 8450 case BPF_FUNC_get_listener_sock: 8451 return &bpf_get_listener_sock_proto; 8452 case BPF_FUNC_skc_lookup_tcp: 8453 return &bpf_tc_skc_lookup_tcp_proto; 8454 case BPF_FUNC_tcp_check_syncookie: 8455 return &bpf_tcp_check_syncookie_proto; 8456 case BPF_FUNC_skb_ecn_set_ce: 8457 return &bpf_skb_ecn_set_ce_proto; 8458 case BPF_FUNC_tcp_gen_syncookie: 8459 return &bpf_tcp_gen_syncookie_proto; 8460 case BPF_FUNC_sk_assign: 8461 return &bpf_sk_assign_proto; 8462 case BPF_FUNC_skb_set_tstamp: 8463 return &bpf_skb_set_tstamp_proto; 8464 #ifdef CONFIG_SYN_COOKIES 
8465 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: 8466 return &bpf_tcp_raw_gen_syncookie_ipv4_proto; 8467 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: 8468 return &bpf_tcp_raw_gen_syncookie_ipv6_proto; 8469 case BPF_FUNC_tcp_raw_check_syncookie_ipv4: 8470 return &bpf_tcp_raw_check_syncookie_ipv4_proto; 8471 case BPF_FUNC_tcp_raw_check_syncookie_ipv6: 8472 return &bpf_tcp_raw_check_syncookie_ipv6_proto; 8473 #endif 8474 #endif 8475 default: 8476 return bpf_sk_base_func_proto(func_id, prog); 8477 } 8478 } 8479 8480 static const struct bpf_func_proto * 8481 xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8482 { 8483 switch (func_id) { 8484 case BPF_FUNC_perf_event_output: 8485 return &bpf_xdp_event_output_proto; 8486 case BPF_FUNC_get_smp_processor_id: 8487 return &bpf_get_smp_processor_id_proto; 8488 case BPF_FUNC_csum_diff: 8489 return &bpf_csum_diff_proto; 8490 case BPF_FUNC_xdp_adjust_head: 8491 return &bpf_xdp_adjust_head_proto; 8492 case BPF_FUNC_xdp_adjust_meta: 8493 return &bpf_xdp_adjust_meta_proto; 8494 case BPF_FUNC_redirect: 8495 return &bpf_xdp_redirect_proto; 8496 case BPF_FUNC_redirect_map: 8497 return &bpf_xdp_redirect_map_proto; 8498 case BPF_FUNC_xdp_adjust_tail: 8499 return &bpf_xdp_adjust_tail_proto; 8500 case BPF_FUNC_xdp_get_buff_len: 8501 return &bpf_xdp_get_buff_len_proto; 8502 case BPF_FUNC_xdp_load_bytes: 8503 return &bpf_xdp_load_bytes_proto; 8504 case BPF_FUNC_xdp_store_bytes: 8505 return &bpf_xdp_store_bytes_proto; 8506 case BPF_FUNC_fib_lookup: 8507 return &bpf_xdp_fib_lookup_proto; 8508 case BPF_FUNC_check_mtu: 8509 return &bpf_xdp_check_mtu_proto; 8510 #ifdef CONFIG_INET 8511 case BPF_FUNC_sk_lookup_udp: 8512 return &bpf_xdp_sk_lookup_udp_proto; 8513 case BPF_FUNC_sk_lookup_tcp: 8514 return &bpf_xdp_sk_lookup_tcp_proto; 8515 case BPF_FUNC_sk_release: 8516 return &bpf_sk_release_proto; 8517 case BPF_FUNC_skc_lookup_tcp: 8518 return &bpf_xdp_skc_lookup_tcp_proto; 8519 case BPF_FUNC_tcp_check_syncookie: 8520 return 
&bpf_tcp_check_syncookie_proto; 8521 case BPF_FUNC_tcp_gen_syncookie: 8522 return &bpf_tcp_gen_syncookie_proto; 8523 #ifdef CONFIG_SYN_COOKIES 8524 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4: 8525 return &bpf_tcp_raw_gen_syncookie_ipv4_proto; 8526 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6: 8527 return &bpf_tcp_raw_gen_syncookie_ipv6_proto; 8528 case BPF_FUNC_tcp_raw_check_syncookie_ipv4: 8529 return &bpf_tcp_raw_check_syncookie_ipv4_proto; 8530 case BPF_FUNC_tcp_raw_check_syncookie_ipv6: 8531 return &bpf_tcp_raw_check_syncookie_ipv6_proto; 8532 #endif 8533 #endif 8534 default: 8535 return bpf_sk_base_func_proto(func_id, prog); 8536 } 8537 8538 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) 8539 /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The 8540 * kfuncs are defined in two different modules, and we want to be able 8541 * to use them interchangeably with the same BTF type ID. Because modules 8542 * can't de-duplicate BTF IDs between each other, we need the type to be 8543 * referenced in the vmlinux BTF or the verifier will get confused about 8544 * the different types. So we add this dummy type reference which will 8545 * be included in vmlinux BTF, allowing both modules to refer to the 8546 * same type ID. 
8547 */ 8548 BTF_TYPE_EMIT(struct nf_conn___init); 8549 #endif 8550 } 8551 8552 const struct bpf_func_proto bpf_sock_map_update_proto __weak; 8553 const struct bpf_func_proto bpf_sock_hash_update_proto __weak; 8554 8555 static const struct bpf_func_proto * 8556 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8557 { 8558 const struct bpf_func_proto *func_proto; 8559 8560 func_proto = cgroup_common_func_proto(func_id, prog); 8561 if (func_proto) 8562 return func_proto; 8563 8564 switch (func_id) { 8565 case BPF_FUNC_setsockopt: 8566 return &bpf_sock_ops_setsockopt_proto; 8567 case BPF_FUNC_getsockopt: 8568 return &bpf_sock_ops_getsockopt_proto; 8569 case BPF_FUNC_sock_ops_cb_flags_set: 8570 return &bpf_sock_ops_cb_flags_set_proto; 8571 case BPF_FUNC_sock_map_update: 8572 return &bpf_sock_map_update_proto; 8573 case BPF_FUNC_sock_hash_update: 8574 return &bpf_sock_hash_update_proto; 8575 case BPF_FUNC_get_socket_cookie: 8576 return &bpf_get_socket_cookie_sock_ops_proto; 8577 case BPF_FUNC_perf_event_output: 8578 return &bpf_event_output_data_proto; 8579 case BPF_FUNC_sk_storage_get: 8580 return &bpf_sk_storage_get_proto; 8581 case BPF_FUNC_sk_storage_delete: 8582 return &bpf_sk_storage_delete_proto; 8583 case BPF_FUNC_get_netns_cookie: 8584 return &bpf_get_netns_cookie_sock_ops_proto; 8585 #ifdef CONFIG_INET 8586 case BPF_FUNC_load_hdr_opt: 8587 return &bpf_sock_ops_load_hdr_opt_proto; 8588 case BPF_FUNC_store_hdr_opt: 8589 return &bpf_sock_ops_store_hdr_opt_proto; 8590 case BPF_FUNC_reserve_hdr_opt: 8591 return &bpf_sock_ops_reserve_hdr_opt_proto; 8592 case BPF_FUNC_tcp_sock: 8593 return &bpf_tcp_sock_proto; 8594 #endif /* CONFIG_INET */ 8595 default: 8596 return bpf_sk_base_func_proto(func_id, prog); 8597 } 8598 } 8599 8600 const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; 8601 const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; 8602 8603 static const struct bpf_func_proto * 8604 sk_msg_func_proto(enum bpf_func_id 
func_id, const struct bpf_prog *prog) 8605 { 8606 switch (func_id) { 8607 case BPF_FUNC_msg_redirect_map: 8608 return &bpf_msg_redirect_map_proto; 8609 case BPF_FUNC_msg_redirect_hash: 8610 return &bpf_msg_redirect_hash_proto; 8611 case BPF_FUNC_msg_apply_bytes: 8612 return &bpf_msg_apply_bytes_proto; 8613 case BPF_FUNC_msg_cork_bytes: 8614 return &bpf_msg_cork_bytes_proto; 8615 case BPF_FUNC_msg_pull_data: 8616 return &bpf_msg_pull_data_proto; 8617 case BPF_FUNC_msg_push_data: 8618 return &bpf_msg_push_data_proto; 8619 case BPF_FUNC_msg_pop_data: 8620 return &bpf_msg_pop_data_proto; 8621 case BPF_FUNC_perf_event_output: 8622 return &bpf_event_output_data_proto; 8623 case BPF_FUNC_sk_storage_get: 8624 return &bpf_sk_storage_get_proto; 8625 case BPF_FUNC_sk_storage_delete: 8626 return &bpf_sk_storage_delete_proto; 8627 case BPF_FUNC_get_netns_cookie: 8628 return &bpf_get_netns_cookie_sk_msg_proto; 8629 default: 8630 return bpf_sk_base_func_proto(func_id, prog); 8631 } 8632 } 8633 8634 const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; 8635 const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; 8636 8637 static const struct bpf_func_proto * 8638 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8639 { 8640 switch (func_id) { 8641 case BPF_FUNC_skb_store_bytes: 8642 return &bpf_skb_store_bytes_proto; 8643 case BPF_FUNC_skb_load_bytes: 8644 return &bpf_skb_load_bytes_proto; 8645 case BPF_FUNC_skb_pull_data: 8646 return &sk_skb_pull_data_proto; 8647 case BPF_FUNC_skb_change_tail: 8648 return &sk_skb_change_tail_proto; 8649 case BPF_FUNC_skb_change_head: 8650 return &sk_skb_change_head_proto; 8651 case BPF_FUNC_skb_adjust_room: 8652 return &sk_skb_adjust_room_proto; 8653 case BPF_FUNC_get_socket_cookie: 8654 return &bpf_get_socket_cookie_proto; 8655 case BPF_FUNC_get_socket_uid: 8656 return &bpf_get_socket_uid_proto; 8657 case BPF_FUNC_sk_redirect_map: 8658 return &bpf_sk_redirect_map_proto; 8659 case BPF_FUNC_sk_redirect_hash: 
8660 return &bpf_sk_redirect_hash_proto; 8661 case BPF_FUNC_perf_event_output: 8662 return &bpf_skb_event_output_proto; 8663 #ifdef CONFIG_INET 8664 case BPF_FUNC_sk_lookup_tcp: 8665 return &bpf_sk_lookup_tcp_proto; 8666 case BPF_FUNC_sk_lookup_udp: 8667 return &bpf_sk_lookup_udp_proto; 8668 case BPF_FUNC_sk_release: 8669 return &bpf_sk_release_proto; 8670 case BPF_FUNC_skc_lookup_tcp: 8671 return &bpf_skc_lookup_tcp_proto; 8672 #endif 8673 default: 8674 return bpf_sk_base_func_proto(func_id, prog); 8675 } 8676 } 8677 8678 static const struct bpf_func_proto * 8679 flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8680 { 8681 switch (func_id) { 8682 case BPF_FUNC_skb_load_bytes: 8683 return &bpf_flow_dissector_load_bytes_proto; 8684 default: 8685 return bpf_sk_base_func_proto(func_id, prog); 8686 } 8687 } 8688 8689 static const struct bpf_func_proto * 8690 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8691 { 8692 switch (func_id) { 8693 case BPF_FUNC_skb_load_bytes: 8694 return &bpf_skb_load_bytes_proto; 8695 case BPF_FUNC_skb_pull_data: 8696 return &bpf_skb_pull_data_proto; 8697 case BPF_FUNC_csum_diff: 8698 return &bpf_csum_diff_proto; 8699 case BPF_FUNC_get_cgroup_classid: 8700 return &bpf_get_cgroup_classid_proto; 8701 case BPF_FUNC_get_route_realm: 8702 return &bpf_get_route_realm_proto; 8703 case BPF_FUNC_get_hash_recalc: 8704 return &bpf_get_hash_recalc_proto; 8705 case BPF_FUNC_perf_event_output: 8706 return &bpf_skb_event_output_proto; 8707 case BPF_FUNC_get_smp_processor_id: 8708 return &bpf_get_smp_processor_id_proto; 8709 case BPF_FUNC_skb_under_cgroup: 8710 return &bpf_skb_under_cgroup_proto; 8711 default: 8712 return bpf_sk_base_func_proto(func_id, prog); 8713 } 8714 } 8715 8716 static const struct bpf_func_proto * 8717 lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8718 { 8719 switch (func_id) { 8720 case BPF_FUNC_lwt_push_encap: 8721 return 
&bpf_lwt_in_push_encap_proto; 8722 default: 8723 return lwt_out_func_proto(func_id, prog); 8724 } 8725 } 8726 8727 static const struct bpf_func_proto * 8728 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8729 { 8730 switch (func_id) { 8731 case BPF_FUNC_skb_get_tunnel_key: 8732 return &bpf_skb_get_tunnel_key_proto; 8733 case BPF_FUNC_skb_set_tunnel_key: 8734 return bpf_get_skb_set_tunnel_proto(func_id); 8735 case BPF_FUNC_skb_get_tunnel_opt: 8736 return &bpf_skb_get_tunnel_opt_proto; 8737 case BPF_FUNC_skb_set_tunnel_opt: 8738 return bpf_get_skb_set_tunnel_proto(func_id); 8739 case BPF_FUNC_redirect: 8740 return &bpf_redirect_proto; 8741 case BPF_FUNC_clone_redirect: 8742 return &bpf_clone_redirect_proto; 8743 case BPF_FUNC_skb_change_tail: 8744 return &bpf_skb_change_tail_proto; 8745 case BPF_FUNC_skb_change_head: 8746 return &bpf_skb_change_head_proto; 8747 case BPF_FUNC_skb_store_bytes: 8748 return &bpf_skb_store_bytes_proto; 8749 case BPF_FUNC_csum_update: 8750 return &bpf_csum_update_proto; 8751 case BPF_FUNC_csum_level: 8752 return &bpf_csum_level_proto; 8753 case BPF_FUNC_l3_csum_replace: 8754 return &bpf_l3_csum_replace_proto; 8755 case BPF_FUNC_l4_csum_replace: 8756 return &bpf_l4_csum_replace_proto; 8757 case BPF_FUNC_set_hash_invalid: 8758 return &bpf_set_hash_invalid_proto; 8759 case BPF_FUNC_lwt_push_encap: 8760 return &bpf_lwt_xmit_push_encap_proto; 8761 default: 8762 return lwt_out_func_proto(func_id, prog); 8763 } 8764 } 8765 8766 static const struct bpf_func_proto * 8767 lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 8768 { 8769 switch (func_id) { 8770 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) 8771 case BPF_FUNC_lwt_seg6_store_bytes: 8772 return &bpf_lwt_seg6_store_bytes_proto; 8773 case BPF_FUNC_lwt_seg6_action: 8774 return &bpf_lwt_seg6_action_proto; 8775 case BPF_FUNC_lwt_seg6_adjust_srh: 8776 return &bpf_lwt_seg6_adjust_srh_proto; 8777 #endif 8778 default: 8779 return 
lwt_out_func_proto(func_id, prog); 8780 } 8781 } 8782 8783 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, 8784 const struct bpf_prog *prog, 8785 struct bpf_insn_access_aux *info) 8786 { 8787 const int size_default = sizeof(__u32); 8788 8789 if (off < 0 || off >= sizeof(struct __sk_buff)) 8790 return false; 8791 8792 /* The verifier guarantees that size > 0. */ 8793 if (off % size != 0) 8794 return false; 8795 8796 switch (off) { 8797 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 8798 if (off + size > offsetofend(struct __sk_buff, cb[4])) 8799 return false; 8800 break; 8801 case bpf_ctx_range(struct __sk_buff, data): 8802 case bpf_ctx_range(struct __sk_buff, data_meta): 8803 case bpf_ctx_range(struct __sk_buff, data_end): 8804 if (info->is_ldsx || size != size_default) 8805 return false; 8806 break; 8807 case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): 8808 case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): 8809 case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): 8810 case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): 8811 if (size != size_default) 8812 return false; 8813 break; 8814 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 8815 return false; 8816 case bpf_ctx_range(struct __sk_buff, hwtstamp): 8817 if (type == BPF_WRITE || size != sizeof(__u64)) 8818 return false; 8819 break; 8820 case bpf_ctx_range(struct __sk_buff, tstamp): 8821 if (size != sizeof(__u64)) 8822 return false; 8823 break; 8824 case bpf_ctx_range_ptr(struct __sk_buff, sk): 8825 if (type == BPF_WRITE || size != sizeof(__u64)) 8826 return false; 8827 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; 8828 break; 8829 case offsetof(struct __sk_buff, tstamp_type): 8830 return false; 8831 case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: 8832 /* Explicitly prohibit access to padding in __sk_buff. 
*/ 8833 return false; 8834 default: 8835 /* Only narrow read access allowed for now. */ 8836 if (type == BPF_WRITE) { 8837 if (size != size_default) 8838 return false; 8839 } else { 8840 bpf_ctx_record_field_size(info, size_default); 8841 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 8842 return false; 8843 } 8844 } 8845 8846 return true; 8847 } 8848 8849 static bool sk_filter_is_valid_access(int off, int size, 8850 enum bpf_access_type type, 8851 const struct bpf_prog *prog, 8852 struct bpf_insn_access_aux *info) 8853 { 8854 switch (off) { 8855 case bpf_ctx_range(struct __sk_buff, tc_classid): 8856 case bpf_ctx_range(struct __sk_buff, data): 8857 case bpf_ctx_range(struct __sk_buff, data_meta): 8858 case bpf_ctx_range(struct __sk_buff, data_end): 8859 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 8860 case bpf_ctx_range(struct __sk_buff, tstamp): 8861 case bpf_ctx_range(struct __sk_buff, wire_len): 8862 case bpf_ctx_range(struct __sk_buff, hwtstamp): 8863 return false; 8864 } 8865 8866 if (type == BPF_WRITE) { 8867 switch (off) { 8868 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 8869 break; 8870 default: 8871 return false; 8872 } 8873 } 8874 8875 return bpf_skb_is_valid_access(off, size, type, prog, info); 8876 } 8877 8878 static bool cg_skb_is_valid_access(int off, int size, 8879 enum bpf_access_type type, 8880 const struct bpf_prog *prog, 8881 struct bpf_insn_access_aux *info) 8882 { 8883 switch (off) { 8884 case bpf_ctx_range(struct __sk_buff, tc_classid): 8885 case bpf_ctx_range(struct __sk_buff, data_meta): 8886 case bpf_ctx_range(struct __sk_buff, wire_len): 8887 return false; 8888 case bpf_ctx_range(struct __sk_buff, data): 8889 case bpf_ctx_range(struct __sk_buff, data_end): 8890 if (!bpf_token_capable(prog->aux->token, CAP_BPF)) 8891 return false; 8892 break; 8893 } 8894 8895 if (type == BPF_WRITE) { 8896 switch (off) { 8897 case bpf_ctx_range(struct __sk_buff, mark): 8898 case bpf_ctx_range(struct __sk_buff, 
priority): 8899 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 8900 break; 8901 case bpf_ctx_range(struct __sk_buff, tstamp): 8902 if (!bpf_token_capable(prog->aux->token, CAP_BPF)) 8903 return false; 8904 break; 8905 default: 8906 return false; 8907 } 8908 } 8909 8910 switch (off) { 8911 case bpf_ctx_range(struct __sk_buff, data): 8912 info->reg_type = PTR_TO_PACKET; 8913 break; 8914 case bpf_ctx_range(struct __sk_buff, data_end): 8915 info->reg_type = PTR_TO_PACKET_END; 8916 break; 8917 } 8918 8919 return bpf_skb_is_valid_access(off, size, type, prog, info); 8920 } 8921 8922 static bool lwt_is_valid_access(int off, int size, 8923 enum bpf_access_type type, 8924 const struct bpf_prog *prog, 8925 struct bpf_insn_access_aux *info) 8926 { 8927 switch (off) { 8928 case bpf_ctx_range(struct __sk_buff, tc_classid): 8929 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 8930 case bpf_ctx_range(struct __sk_buff, data_meta): 8931 case bpf_ctx_range(struct __sk_buff, tstamp): 8932 case bpf_ctx_range(struct __sk_buff, wire_len): 8933 case bpf_ctx_range(struct __sk_buff, hwtstamp): 8934 return false; 8935 } 8936 8937 if (type == BPF_WRITE) { 8938 switch (off) { 8939 case bpf_ctx_range(struct __sk_buff, mark): 8940 case bpf_ctx_range(struct __sk_buff, priority): 8941 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 8942 break; 8943 default: 8944 return false; 8945 } 8946 } 8947 8948 switch (off) { 8949 case bpf_ctx_range(struct __sk_buff, data): 8950 info->reg_type = PTR_TO_PACKET; 8951 break; 8952 case bpf_ctx_range(struct __sk_buff, data_end): 8953 info->reg_type = PTR_TO_PACKET_END; 8954 break; 8955 } 8956 8957 return bpf_skb_is_valid_access(off, size, type, prog, info); 8958 } 8959 8960 /* Attach type specific accesses */ 8961 static bool __sock_filter_check_attach_type(int off, 8962 enum bpf_access_type access_type, 8963 enum bpf_attach_type attach_type) 8964 { 8965 switch (off) { 8966 case offsetof(struct bpf_sock, bound_dev_if): 8967 case 
offsetof(struct bpf_sock, mark): 8968 case offsetof(struct bpf_sock, priority): 8969 switch (attach_type) { 8970 case BPF_CGROUP_INET_SOCK_CREATE: 8971 case BPF_CGROUP_INET_SOCK_RELEASE: 8972 goto full_access; 8973 default: 8974 return false; 8975 } 8976 case bpf_ctx_range(struct bpf_sock, src_ip4): 8977 switch (attach_type) { 8978 case BPF_CGROUP_INET4_POST_BIND: 8979 goto read_only; 8980 default: 8981 return false; 8982 } 8983 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): 8984 switch (attach_type) { 8985 case BPF_CGROUP_INET6_POST_BIND: 8986 goto read_only; 8987 default: 8988 return false; 8989 } 8990 case bpf_ctx_range(struct bpf_sock, src_port): 8991 switch (attach_type) { 8992 case BPF_CGROUP_INET4_POST_BIND: 8993 case BPF_CGROUP_INET6_POST_BIND: 8994 goto read_only; 8995 default: 8996 return false; 8997 } 8998 } 8999 read_only: 9000 return access_type == BPF_READ; 9001 full_access: 9002 return true; 9003 } 9004 9005 bool bpf_sock_common_is_valid_access(int off, int size, 9006 enum bpf_access_type type, 9007 struct bpf_insn_access_aux *info) 9008 { 9009 switch (off) { 9010 case bpf_ctx_range_till(struct bpf_sock, type, priority): 9011 return false; 9012 default: 9013 return bpf_sock_is_valid_access(off, size, type, info); 9014 } 9015 } 9016 9017 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, 9018 struct bpf_insn_access_aux *info) 9019 { 9020 const int size_default = sizeof(__u32); 9021 int field_size; 9022 9023 if (off < 0 || off >= sizeof(struct bpf_sock)) 9024 return false; 9025 if (off % size != 0) 9026 return false; 9027 9028 switch (off) { 9029 case offsetof(struct bpf_sock, state): 9030 case offsetof(struct bpf_sock, family): 9031 case offsetof(struct bpf_sock, type): 9032 case offsetof(struct bpf_sock, protocol): 9033 case offsetof(struct bpf_sock, src_port): 9034 case offsetof(struct bpf_sock, rx_queue_mapping): 9035 case bpf_ctx_range(struct bpf_sock, src_ip4): 9036 case bpf_ctx_range_till(struct 
bpf_sock, src_ip6[0], src_ip6[3]): 9037 case bpf_ctx_range(struct bpf_sock, dst_ip4): 9038 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): 9039 bpf_ctx_record_field_size(info, size_default); 9040 return bpf_ctx_narrow_access_ok(off, size, size_default); 9041 case bpf_ctx_range(struct bpf_sock, dst_port): 9042 field_size = size == size_default ? 9043 size_default : sizeof_field(struct bpf_sock, dst_port); 9044 bpf_ctx_record_field_size(info, field_size); 9045 return bpf_ctx_narrow_access_ok(off, size, field_size); 9046 case offsetofend(struct bpf_sock, dst_port) ... 9047 offsetof(struct bpf_sock, dst_ip4) - 1: 9048 return false; 9049 } 9050 9051 return size == size_default; 9052 } 9053 9054 static bool sock_filter_is_valid_access(int off, int size, 9055 enum bpf_access_type type, 9056 const struct bpf_prog *prog, 9057 struct bpf_insn_access_aux *info) 9058 { 9059 if (!bpf_sock_is_valid_access(off, size, type, info)) 9060 return false; 9061 return __sock_filter_check_attach_type(off, type, 9062 prog->expected_attach_type); 9063 } 9064 9065 static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, 9066 const struct bpf_prog *prog) 9067 { 9068 /* Neither direct read nor direct write requires any preliminary 9069 * action. 9070 */ 9071 return 0; 9072 } 9073 9074 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, 9075 const struct bpf_prog *prog, int drop_verdict) 9076 { 9077 struct bpf_insn *insn = insn_buf; 9078 9079 if (!direct_write) 9080 return 0; 9081 9082 /* if (!skb->cloned) 9083 * goto start; 9084 * 9085 * (Fast-path, otherwise approximation that we might be 9086 * a clone, do the rest in helper.) 
9087 */ 9088 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET); 9089 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); 9090 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); 9091 9092 /* ret = bpf_skb_pull_data(skb, 0); */ 9093 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 9094 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); 9095 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 9096 BPF_FUNC_skb_pull_data); 9097 /* if (!ret) 9098 * goto restore; 9099 * return TC_ACT_SHOT; 9100 */ 9101 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); 9102 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); 9103 *insn++ = BPF_EXIT_INSN(); 9104 9105 /* restore: */ 9106 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 9107 /* start: */ 9108 *insn++ = prog->insnsi[0]; 9109 9110 return insn - insn_buf; 9111 } 9112 9113 static int bpf_gen_ld_abs(const struct bpf_insn *orig, 9114 struct bpf_insn *insn_buf) 9115 { 9116 bool indirect = BPF_MODE(orig->code) == BPF_IND; 9117 struct bpf_insn *insn = insn_buf; 9118 9119 if (!indirect) { 9120 *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); 9121 } else { 9122 *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); 9123 if (orig->imm) 9124 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); 9125 } 9126 /* We're guaranteed here that CTX is in R6. 
*/ 9127 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); 9128 9129 switch (BPF_SIZE(orig->code)) { 9130 case BPF_B: 9131 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); 9132 break; 9133 case BPF_H: 9134 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); 9135 break; 9136 case BPF_W: 9137 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); 9138 break; 9139 } 9140 9141 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); 9142 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); 9143 *insn++ = BPF_EXIT_INSN(); 9144 9145 return insn - insn_buf; 9146 } 9147 9148 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 9149 const struct bpf_prog *prog) 9150 { 9151 return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); 9152 } 9153 9154 static bool tc_cls_act_is_valid_access(int off, int size, 9155 enum bpf_access_type type, 9156 const struct bpf_prog *prog, 9157 struct bpf_insn_access_aux *info) 9158 { 9159 if (type == BPF_WRITE) { 9160 switch (off) { 9161 case bpf_ctx_range(struct __sk_buff, mark): 9162 case bpf_ctx_range(struct __sk_buff, tc_index): 9163 case bpf_ctx_range(struct __sk_buff, priority): 9164 case bpf_ctx_range(struct __sk_buff, tc_classid): 9165 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 9166 case bpf_ctx_range(struct __sk_buff, tstamp): 9167 case bpf_ctx_range(struct __sk_buff, queue_mapping): 9168 break; 9169 default: 9170 return false; 9171 } 9172 } 9173 9174 switch (off) { 9175 case bpf_ctx_range(struct __sk_buff, data): 9176 info->reg_type = PTR_TO_PACKET; 9177 break; 9178 case bpf_ctx_range(struct __sk_buff, data_meta): 9179 info->reg_type = PTR_TO_PACKET_META; 9180 break; 9181 case bpf_ctx_range(struct __sk_buff, data_end): 9182 info->reg_type = PTR_TO_PACKET_END; 9183 break; 9184 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 9185 return false; 9186 case offsetof(struct __sk_buff, tstamp_type): 9187 /* The convert_ctx_access() on reading and writing 9188 * 
__sk_buff->tstamp depends on whether the bpf prog 9189 * has used __sk_buff->tstamp_type or not. 9190 * Thus, we need to set prog->tstamp_type_access 9191 * earlier during is_valid_access() here. 9192 */ 9193 ((struct bpf_prog *)prog)->tstamp_type_access = 1; 9194 return size == sizeof(__u8); 9195 } 9196 9197 return bpf_skb_is_valid_access(off, size, type, prog, info); 9198 } 9199 9200 DEFINE_MUTEX(nf_conn_btf_access_lock); 9201 EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); 9202 9203 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, 9204 const struct bpf_reg_state *reg, 9205 int off, int size); 9206 EXPORT_SYMBOL_GPL(nfct_btf_struct_access); 9207 9208 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, 9209 const struct bpf_reg_state *reg, 9210 int off, int size) 9211 { 9212 int ret = -EACCES; 9213 9214 mutex_lock(&nf_conn_btf_access_lock); 9215 if (nfct_btf_struct_access) 9216 ret = nfct_btf_struct_access(log, reg, off, size); 9217 mutex_unlock(&nf_conn_btf_access_lock); 9218 9219 return ret; 9220 } 9221 9222 static bool __is_valid_xdp_access(int off, int size) 9223 { 9224 if (off < 0 || off >= sizeof(struct xdp_md)) 9225 return false; 9226 if (off % size != 0) 9227 return false; 9228 if (size != sizeof(__u32)) 9229 return false; 9230 9231 return true; 9232 } 9233 9234 static bool xdp_is_valid_access(int off, int size, 9235 enum bpf_access_type type, 9236 const struct bpf_prog *prog, 9237 struct bpf_insn_access_aux *info) 9238 { 9239 if (prog->expected_attach_type != BPF_XDP_DEVMAP) { 9240 switch (off) { 9241 case offsetof(struct xdp_md, egress_ifindex): 9242 return false; 9243 } 9244 } 9245 9246 if (type == BPF_WRITE) { 9247 if (bpf_prog_is_offloaded(prog->aux)) { 9248 switch (off) { 9249 case offsetof(struct xdp_md, rx_queue_index): 9250 return __is_valid_xdp_access(off, size); 9251 } 9252 } 9253 return false; 9254 } else { 9255 switch (off) { 9256 case offsetof(struct xdp_md, data_meta): 9257 case offsetof(struct xdp_md, data): 9258 
case offsetof(struct xdp_md, data_end): 9259 if (info->is_ldsx) 9260 return false; 9261 } 9262 } 9263 9264 switch (off) { 9265 case offsetof(struct xdp_md, data): 9266 info->reg_type = PTR_TO_PACKET; 9267 break; 9268 case offsetof(struct xdp_md, data_meta): 9269 info->reg_type = PTR_TO_PACKET_META; 9270 break; 9271 case offsetof(struct xdp_md, data_end): 9272 info->reg_type = PTR_TO_PACKET_END; 9273 break; 9274 } 9275 9276 return __is_valid_xdp_access(off, size); 9277 } 9278 9279 void bpf_warn_invalid_xdp_action(const struct net_device *dev, 9280 const struct bpf_prog *prog, u32 act) 9281 { 9282 const u32 act_max = XDP_REDIRECT; 9283 9284 pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", 9285 act > act_max ? "Illegal" : "Driver unsupported", 9286 act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); 9287 } 9288 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 9289 9290 static int xdp_btf_struct_access(struct bpf_verifier_log *log, 9291 const struct bpf_reg_state *reg, 9292 int off, int size) 9293 { 9294 int ret = -EACCES; 9295 9296 mutex_lock(&nf_conn_btf_access_lock); 9297 if (nfct_btf_struct_access) 9298 ret = nfct_btf_struct_access(log, reg, off, size); 9299 mutex_unlock(&nf_conn_btf_access_lock); 9300 9301 return ret; 9302 } 9303 9304 static bool sock_addr_is_valid_access(int off, int size, 9305 enum bpf_access_type type, 9306 const struct bpf_prog *prog, 9307 struct bpf_insn_access_aux *info) 9308 { 9309 const int size_default = sizeof(__u32); 9310 9311 if (off < 0 || off >= sizeof(struct bpf_sock_addr)) 9312 return false; 9313 if (off % size != 0) 9314 return false; 9315 9316 /* Disallow access to fields not belonging to the attach type's address 9317 * family. 
9318 */ 9319 switch (off) { 9320 case bpf_ctx_range(struct bpf_sock_addr, user_ip4): 9321 switch (prog->expected_attach_type) { 9322 case BPF_CGROUP_INET4_BIND: 9323 case BPF_CGROUP_INET4_CONNECT: 9324 case BPF_CGROUP_INET4_GETPEERNAME: 9325 case BPF_CGROUP_INET4_GETSOCKNAME: 9326 case BPF_CGROUP_UDP4_SENDMSG: 9327 case BPF_CGROUP_UDP4_RECVMSG: 9328 break; 9329 default: 9330 return false; 9331 } 9332 break; 9333 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): 9334 switch (prog->expected_attach_type) { 9335 case BPF_CGROUP_INET6_BIND: 9336 case BPF_CGROUP_INET6_CONNECT: 9337 case BPF_CGROUP_INET6_GETPEERNAME: 9338 case BPF_CGROUP_INET6_GETSOCKNAME: 9339 case BPF_CGROUP_UDP6_SENDMSG: 9340 case BPF_CGROUP_UDP6_RECVMSG: 9341 break; 9342 default: 9343 return false; 9344 } 9345 break; 9346 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): 9347 switch (prog->expected_attach_type) { 9348 case BPF_CGROUP_UDP4_SENDMSG: 9349 break; 9350 default: 9351 return false; 9352 } 9353 break; 9354 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], 9355 msg_src_ip6[3]): 9356 switch (prog->expected_attach_type) { 9357 case BPF_CGROUP_UDP6_SENDMSG: 9358 break; 9359 default: 9360 return false; 9361 } 9362 break; 9363 } 9364 9365 switch (off) { 9366 case bpf_ctx_range(struct bpf_sock_addr, user_ip4): 9367 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): 9368 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): 9369 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], 9370 msg_src_ip6[3]): 9371 case bpf_ctx_range(struct bpf_sock_addr, user_port): 9372 if (type == BPF_READ) { 9373 bpf_ctx_record_field_size(info, size_default); 9374 9375 if (bpf_ctx_wide_access_ok(off, size, 9376 struct bpf_sock_addr, 9377 user_ip6)) 9378 return true; 9379 9380 if (bpf_ctx_wide_access_ok(off, size, 9381 struct bpf_sock_addr, 9382 msg_src_ip6)) 9383 return true; 9384 9385 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 9386 
return false; 9387 } else { 9388 if (bpf_ctx_wide_access_ok(off, size, 9389 struct bpf_sock_addr, 9390 user_ip6)) 9391 return true; 9392 9393 if (bpf_ctx_wide_access_ok(off, size, 9394 struct bpf_sock_addr, 9395 msg_src_ip6)) 9396 return true; 9397 9398 if (size != size_default) 9399 return false; 9400 } 9401 break; 9402 case bpf_ctx_range_ptr(struct bpf_sock_addr, sk): 9403 if (type != BPF_READ) 9404 return false; 9405 if (size != sizeof(__u64)) 9406 return false; 9407 info->reg_type = PTR_TO_SOCKET; 9408 break; 9409 case bpf_ctx_range(struct bpf_sock_addr, user_family): 9410 case bpf_ctx_range(struct bpf_sock_addr, family): 9411 case bpf_ctx_range(struct bpf_sock_addr, type): 9412 case bpf_ctx_range(struct bpf_sock_addr, protocol): 9413 if (type != BPF_READ) 9414 return false; 9415 if (size != size_default) 9416 return false; 9417 break; 9418 default: 9419 return false; 9420 } 9421 9422 return true; 9423 } 9424 9425 static bool sock_ops_is_valid_access(int off, int size, 9426 enum bpf_access_type type, 9427 const struct bpf_prog *prog, 9428 struct bpf_insn_access_aux *info) 9429 { 9430 const int size_default = sizeof(__u32); 9431 9432 if (off < 0 || off >= sizeof(struct bpf_sock_ops)) 9433 return false; 9434 9435 /* The verifier guarantees that size > 0. 
*/ 9436 if (off % size != 0) 9437 return false; 9438 9439 if (type == BPF_WRITE) { 9440 switch (off) { 9441 case offsetof(struct bpf_sock_ops, reply): 9442 case offsetof(struct bpf_sock_ops, sk_txhash): 9443 if (size != size_default) 9444 return false; 9445 break; 9446 default: 9447 return false; 9448 } 9449 } else { 9450 switch (off) { 9451 case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, 9452 bytes_acked): 9453 if (size != sizeof(__u64)) 9454 return false; 9455 break; 9456 case bpf_ctx_range_ptr(struct bpf_sock_ops, sk): 9457 if (size != sizeof(__u64)) 9458 return false; 9459 info->reg_type = PTR_TO_SOCKET_OR_NULL; 9460 break; 9461 case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data): 9462 if (size != sizeof(__u64)) 9463 return false; 9464 info->reg_type = PTR_TO_PACKET; 9465 break; 9466 case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end): 9467 if (size != sizeof(__u64)) 9468 return false; 9469 info->reg_type = PTR_TO_PACKET_END; 9470 break; 9471 case offsetof(struct bpf_sock_ops, skb_tcp_flags): 9472 bpf_ctx_record_field_size(info, size_default); 9473 return bpf_ctx_narrow_access_ok(off, size, 9474 size_default); 9475 case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp): 9476 if (size != sizeof(__u64)) 9477 return false; 9478 break; 9479 default: 9480 if (size != size_default) 9481 return false; 9482 break; 9483 } 9484 } 9485 9486 return true; 9487 } 9488 9489 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, 9490 const struct bpf_prog *prog) 9491 { 9492 return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); 9493 } 9494 9495 static bool sk_skb_is_valid_access(int off, int size, 9496 enum bpf_access_type type, 9497 const struct bpf_prog *prog, 9498 struct bpf_insn_access_aux *info) 9499 { 9500 switch (off) { 9501 case bpf_ctx_range(struct __sk_buff, tc_classid): 9502 case bpf_ctx_range(struct __sk_buff, data_meta): 9503 case bpf_ctx_range(struct __sk_buff, tstamp): 9504 case bpf_ctx_range(struct __sk_buff, 
wire_len): 9505 case bpf_ctx_range(struct __sk_buff, hwtstamp): 9506 return false; 9507 } 9508 9509 if (type == BPF_WRITE) { 9510 switch (off) { 9511 case bpf_ctx_range(struct __sk_buff, tc_index): 9512 case bpf_ctx_range(struct __sk_buff, priority): 9513 break; 9514 default: 9515 return false; 9516 } 9517 } 9518 9519 switch (off) { 9520 case bpf_ctx_range(struct __sk_buff, mark): 9521 return false; 9522 case bpf_ctx_range(struct __sk_buff, data): 9523 info->reg_type = PTR_TO_PACKET; 9524 break; 9525 case bpf_ctx_range(struct __sk_buff, data_end): 9526 info->reg_type = PTR_TO_PACKET_END; 9527 break; 9528 } 9529 9530 return bpf_skb_is_valid_access(off, size, type, prog, info); 9531 } 9532 9533 static bool sk_msg_is_valid_access(int off, int size, 9534 enum bpf_access_type type, 9535 const struct bpf_prog *prog, 9536 struct bpf_insn_access_aux *info) 9537 { 9538 if (type == BPF_WRITE) 9539 return false; 9540 9541 if (off % size != 0) 9542 return false; 9543 9544 switch (off) { 9545 case bpf_ctx_range_ptr(struct sk_msg_md, data): 9546 info->reg_type = PTR_TO_PACKET; 9547 if (size != sizeof(__u64)) 9548 return false; 9549 break; 9550 case bpf_ctx_range_ptr(struct sk_msg_md, data_end): 9551 info->reg_type = PTR_TO_PACKET_END; 9552 if (size != sizeof(__u64)) 9553 return false; 9554 break; 9555 case bpf_ctx_range_ptr(struct sk_msg_md, sk): 9556 if (size != sizeof(__u64)) 9557 return false; 9558 info->reg_type = PTR_TO_SOCKET; 9559 break; 9560 case bpf_ctx_range(struct sk_msg_md, family): 9561 case bpf_ctx_range(struct sk_msg_md, remote_ip4): 9562 case bpf_ctx_range(struct sk_msg_md, local_ip4): 9563 case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]): 9564 case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]): 9565 case bpf_ctx_range(struct sk_msg_md, remote_port): 9566 case bpf_ctx_range(struct sk_msg_md, local_port): 9567 case bpf_ctx_range(struct sk_msg_md, size): 9568 if (size != sizeof(__u32)) 9569 return false; 9570 break; 9571 
default: 9572 return false; 9573 } 9574 return true; 9575 } 9576 9577 static bool flow_dissector_is_valid_access(int off, int size, 9578 enum bpf_access_type type, 9579 const struct bpf_prog *prog, 9580 struct bpf_insn_access_aux *info) 9581 { 9582 const int size_default = sizeof(__u32); 9583 9584 if (off < 0 || off >= sizeof(struct __sk_buff)) 9585 return false; 9586 9587 if (off % size != 0) 9588 return false; 9589 9590 if (type == BPF_WRITE) 9591 return false; 9592 9593 switch (off) { 9594 case bpf_ctx_range(struct __sk_buff, data): 9595 if (info->is_ldsx || size != size_default) 9596 return false; 9597 info->reg_type = PTR_TO_PACKET; 9598 return true; 9599 case bpf_ctx_range(struct __sk_buff, data_end): 9600 if (info->is_ldsx || size != size_default) 9601 return false; 9602 info->reg_type = PTR_TO_PACKET_END; 9603 return true; 9604 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): 9605 if (size != sizeof(__u64)) 9606 return false; 9607 info->reg_type = PTR_TO_FLOW_KEYS; 9608 return true; 9609 default: 9610 return false; 9611 } 9612 } 9613 9614 static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, 9615 const struct bpf_insn *si, 9616 struct bpf_insn *insn_buf, 9617 struct bpf_prog *prog, 9618 u32 *target_size) 9619 9620 { 9621 struct bpf_insn *insn = insn_buf; 9622 9623 switch (si->off) { 9624 case offsetof(struct __sk_buff, data): 9625 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data), 9626 si->dst_reg, si->src_reg, 9627 offsetof(struct bpf_flow_dissector, data)); 9628 break; 9629 9630 case offsetof(struct __sk_buff, data_end): 9631 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end), 9632 si->dst_reg, si->src_reg, 9633 offsetof(struct bpf_flow_dissector, data_end)); 9634 break; 9635 9636 case offsetof(struct __sk_buff, flow_keys): 9637 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys), 9638 si->dst_reg, si->src_reg, 9639 offsetof(struct bpf_flow_dissector, 
flow_keys)); 9640 break; 9641 } 9642 9643 return insn - insn_buf; 9644 } 9645 9646 static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, 9647 struct bpf_insn *insn) 9648 { 9649 __u8 value_reg = si->dst_reg; 9650 __u8 skb_reg = si->src_reg; 9651 BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); 9652 BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); 9653 BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); 9654 BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); 9655 *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); 9656 *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); 9657 #ifdef __BIG_ENDIAN_BITFIELD 9658 *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); 9659 #else 9660 BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); 9661 #endif 9662 9663 return insn; 9664 } 9665 9666 static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, 9667 struct bpf_insn *insn) 9668 { 9669 /* si->dst_reg = skb_shinfo(SKB); */ 9670 #ifdef NET_SKBUFF_DATA_USES_OFFSET 9671 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), 9672 BPF_REG_AX, skb_reg, 9673 offsetof(struct sk_buff, end)); 9674 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), 9675 dst_reg, skb_reg, 9676 offsetof(struct sk_buff, head)); 9677 *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); 9678 #else 9679 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), 9680 dst_reg, skb_reg, 9681 offsetof(struct sk_buff, end)); 9682 #endif 9683 9684 return insn; 9685 } 9686 9687 static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, 9688 const struct bpf_insn *si, 9689 struct bpf_insn *insn) 9690 { 9691 __u8 value_reg = si->dst_reg; 9692 __u8 skb_reg = si->src_reg; 9693 9694 #ifdef CONFIG_NET_XGRESS 9695 /* If the tstamp_type is read, 9696 * the bpf prog is aware the tstamp could have delivery time. 
9697 * Thus, read skb->tstamp as is if tstamp_type_access is true. 9698 */ 9699 if (!prog->tstamp_type_access) { 9700 /* AX is needed because src_reg and dst_reg could be the same */ 9701 __u8 tmp_reg = BPF_REG_AX; 9702 9703 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); 9704 /* check if ingress mask bits is set */ 9705 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); 9706 *insn++ = BPF_JMP_A(4); 9707 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); 9708 *insn++ = BPF_JMP_A(2); 9709 /* skb->tc_at_ingress && skb->tstamp_type, 9710 * read 0 as the (rcv) timestamp. 9711 */ 9712 *insn++ = BPF_MOV64_IMM(value_reg, 0); 9713 *insn++ = BPF_JMP_A(1); 9714 } 9715 #endif 9716 9717 *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, 9718 offsetof(struct sk_buff, tstamp)); 9719 return insn; 9720 } 9721 9722 static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, 9723 const struct bpf_insn *si, 9724 struct bpf_insn *insn) 9725 { 9726 __u8 value_reg = si->src_reg; 9727 __u8 skb_reg = si->dst_reg; 9728 9729 #ifdef CONFIG_NET_XGRESS 9730 /* If the tstamp_type is read, 9731 * the bpf prog is aware the tstamp could have delivery time. 9732 * Thus, write skb->tstamp as is if tstamp_type_access is true. 9733 * Otherwise, writing at ingress will have to clear the 9734 * skb->tstamp_type bit also. 
9735 */ 9736 if (!prog->tstamp_type_access) { 9737 __u8 tmp_reg = BPF_REG_AX; 9738 9739 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); 9740 /* Writing __sk_buff->tstamp as ingress, goto <clear> */ 9741 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); 9742 /* goto <store> */ 9743 *insn++ = BPF_JMP_A(2); 9744 /* <clear>: skb->tstamp_type */ 9745 *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); 9746 *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); 9747 } 9748 #endif 9749 9750 /* <store>: skb->tstamp = tstamp */ 9751 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM, 9752 skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm); 9753 return insn; 9754 } 9755 9756 #define BPF_EMIT_STORE(size, si, off) \ 9757 BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \ 9758 (si)->dst_reg, (si)->src_reg, (off), (si)->imm) 9759 9760 static u32 bpf_convert_ctx_access(enum bpf_access_type type, 9761 const struct bpf_insn *si, 9762 struct bpf_insn *insn_buf, 9763 struct bpf_prog *prog, u32 *target_size) 9764 { 9765 struct bpf_insn *insn = insn_buf; 9766 int off; 9767 9768 switch (si->off) { 9769 case offsetof(struct __sk_buff, len): 9770 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9771 bpf_target_off(struct sk_buff, len, 4, 9772 target_size)); 9773 break; 9774 9775 case offsetof(struct __sk_buff, protocol): 9776 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 9777 bpf_target_off(struct sk_buff, protocol, 2, 9778 target_size)); 9779 break; 9780 9781 case offsetof(struct __sk_buff, vlan_proto): 9782 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 9783 bpf_target_off(struct sk_buff, vlan_proto, 2, 9784 target_size)); 9785 break; 9786 9787 case offsetof(struct __sk_buff, priority): 9788 if (type == BPF_WRITE) 9789 *insn++ = BPF_EMIT_STORE(BPF_W, si, 9790 bpf_target_off(struct sk_buff, priority, 4, 9791 target_size)); 9792 else 9793 *insn++ = BPF_LDX_MEM(BPF_W, 
si->dst_reg, si->src_reg, 9794 bpf_target_off(struct sk_buff, priority, 4, 9795 target_size)); 9796 break; 9797 9798 case offsetof(struct __sk_buff, ingress_ifindex): 9799 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9800 bpf_target_off(struct sk_buff, skb_iif, 4, 9801 target_size)); 9802 break; 9803 9804 case offsetof(struct __sk_buff, ifindex): 9805 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 9806 si->dst_reg, si->src_reg, 9807 offsetof(struct sk_buff, dev)); 9808 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 9809 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 9810 bpf_target_off(struct net_device, ifindex, 4, 9811 target_size)); 9812 break; 9813 9814 case offsetof(struct __sk_buff, hash): 9815 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9816 bpf_target_off(struct sk_buff, hash, 4, 9817 target_size)); 9818 break; 9819 9820 case offsetof(struct __sk_buff, mark): 9821 if (type == BPF_WRITE) 9822 *insn++ = BPF_EMIT_STORE(BPF_W, si, 9823 bpf_target_off(struct sk_buff, mark, 4, 9824 target_size)); 9825 else 9826 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9827 bpf_target_off(struct sk_buff, mark, 4, 9828 target_size)); 9829 break; 9830 9831 case offsetof(struct __sk_buff, pkt_type): 9832 *target_size = 1; 9833 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, 9834 PKT_TYPE_OFFSET); 9835 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); 9836 #ifdef __BIG_ENDIAN_BITFIELD 9837 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); 9838 #endif 9839 break; 9840 9841 case offsetof(struct __sk_buff, queue_mapping): 9842 if (type == BPF_WRITE) { 9843 u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); 9844 9845 if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { 9846 *insn++ = BPF_JMP_A(0); /* noop */ 9847 break; 9848 } 9849 9850 if (BPF_CLASS(si->code) == BPF_STX) 9851 *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); 9852 *insn++ = 
BPF_EMIT_STORE(BPF_H, si, offset); 9853 } else { 9854 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 9855 bpf_target_off(struct sk_buff, 9856 queue_mapping, 9857 2, target_size)); 9858 } 9859 break; 9860 9861 case offsetof(struct __sk_buff, vlan_present): 9862 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9863 bpf_target_off(struct sk_buff, 9864 vlan_all, 4, target_size)); 9865 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 9866 *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); 9867 break; 9868 9869 case offsetof(struct __sk_buff, vlan_tci): 9870 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 9871 bpf_target_off(struct sk_buff, vlan_tci, 2, 9872 target_size)); 9873 break; 9874 9875 case offsetof(struct __sk_buff, cb[0]) ... 9876 offsetofend(struct __sk_buff, cb[4]) - 1: 9877 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20); 9878 BUILD_BUG_ON((offsetof(struct sk_buff, cb) + 9879 offsetof(struct qdisc_skb_cb, data)) % 9880 sizeof(__u64)); 9881 9882 prog->cb_access = 1; 9883 off = si->off; 9884 off -= offsetof(struct __sk_buff, cb[0]); 9885 off += offsetof(struct sk_buff, cb); 9886 off += offsetof(struct qdisc_skb_cb, data); 9887 if (type == BPF_WRITE) 9888 *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off); 9889 else 9890 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, 9891 si->src_reg, off); 9892 break; 9893 9894 case offsetof(struct __sk_buff, tc_classid): 9895 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2); 9896 9897 off = si->off; 9898 off -= offsetof(struct __sk_buff, tc_classid); 9899 off += offsetof(struct sk_buff, cb); 9900 off += offsetof(struct qdisc_skb_cb, tc_classid); 9901 *target_size = 2; 9902 if (type == BPF_WRITE) 9903 *insn++ = BPF_EMIT_STORE(BPF_H, si, off); 9904 else 9905 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, 9906 si->src_reg, off); 9907 break; 9908 9909 case offsetof(struct __sk_buff, data): 9910 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 9911 si->dst_reg, 
si->src_reg, 9912 offsetof(struct sk_buff, data)); 9913 break; 9914 9915 case offsetof(struct __sk_buff, data_meta): 9916 off = si->off; 9917 off -= offsetof(struct __sk_buff, data_meta); 9918 off += offsetof(struct sk_buff, cb); 9919 off += offsetof(struct bpf_skb_data_end, data_meta); 9920 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, 9921 si->src_reg, off); 9922 break; 9923 9924 case offsetof(struct __sk_buff, data_end): 9925 off = si->off; 9926 off -= offsetof(struct __sk_buff, data_end); 9927 off += offsetof(struct sk_buff, cb); 9928 off += offsetof(struct bpf_skb_data_end, data_end); 9929 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, 9930 si->src_reg, off); 9931 break; 9932 9933 case offsetof(struct __sk_buff, tc_index): 9934 #ifdef CONFIG_NET_SCHED 9935 if (type == BPF_WRITE) 9936 *insn++ = BPF_EMIT_STORE(BPF_H, si, 9937 bpf_target_off(struct sk_buff, tc_index, 2, 9938 target_size)); 9939 else 9940 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 9941 bpf_target_off(struct sk_buff, tc_index, 2, 9942 target_size)); 9943 #else 9944 *target_size = 2; 9945 if (type == BPF_WRITE) 9946 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); 9947 else 9948 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 9949 #endif 9950 break; 9951 9952 case offsetof(struct __sk_buff, napi_id): 9953 #if defined(CONFIG_NET_RX_BUSY_POLL) 9954 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 9955 bpf_target_off(struct sk_buff, napi_id, 4, 9956 target_size)); 9957 *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); 9958 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 9959 #else 9960 *target_size = 4; 9961 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 9962 #endif 9963 break; 9964 case offsetof(struct __sk_buff, family): 9965 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); 9966 9967 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 9968 si->dst_reg, si->src_reg, 9969 offsetof(struct sk_buff, sk)); 9970 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, 
si->dst_reg, 9971 bpf_target_off(struct sock_common, 9972 skc_family, 9973 2, target_size)); 9974 break; 9975 case offsetof(struct __sk_buff, remote_ip4): 9976 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); 9977 9978 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 9979 si->dst_reg, si->src_reg, 9980 offsetof(struct sk_buff, sk)); 9981 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 9982 bpf_target_off(struct sock_common, 9983 skc_daddr, 9984 4, target_size)); 9985 break; 9986 case offsetof(struct __sk_buff, local_ip4): 9987 BUILD_BUG_ON(sizeof_field(struct sock_common, 9988 skc_rcv_saddr) != 4); 9989 9990 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 9991 si->dst_reg, si->src_reg, 9992 offsetof(struct sk_buff, sk)); 9993 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 9994 bpf_target_off(struct sock_common, 9995 skc_rcv_saddr, 9996 4, target_size)); 9997 break; 9998 case offsetof(struct __sk_buff, remote_ip6[0]) ... 9999 offsetof(struct __sk_buff, remote_ip6[3]): 10000 #if IS_ENABLED(CONFIG_IPV6) 10001 BUILD_BUG_ON(sizeof_field(struct sock_common, 10002 skc_v6_daddr.s6_addr32[0]) != 4); 10003 10004 off = si->off; 10005 off -= offsetof(struct __sk_buff, remote_ip6[0]); 10006 10007 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 10008 si->dst_reg, si->src_reg, 10009 offsetof(struct sk_buff, sk)); 10010 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10011 offsetof(struct sock_common, 10012 skc_v6_daddr.s6_addr32[0]) + 10013 off); 10014 #else 10015 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10016 #endif 10017 break; 10018 case offsetof(struct __sk_buff, local_ip6[0]) ... 
10019 offsetof(struct __sk_buff, local_ip6[3]): 10020 #if IS_ENABLED(CONFIG_IPV6) 10021 BUILD_BUG_ON(sizeof_field(struct sock_common, 10022 skc_v6_rcv_saddr.s6_addr32[0]) != 4); 10023 10024 off = si->off; 10025 off -= offsetof(struct __sk_buff, local_ip6[0]); 10026 10027 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 10028 si->dst_reg, si->src_reg, 10029 offsetof(struct sk_buff, sk)); 10030 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10031 offsetof(struct sock_common, 10032 skc_v6_rcv_saddr.s6_addr32[0]) + 10033 off); 10034 #else 10035 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10036 #endif 10037 break; 10038 10039 case offsetof(struct __sk_buff, remote_port): 10040 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); 10041 10042 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 10043 si->dst_reg, si->src_reg, 10044 offsetof(struct sk_buff, sk)); 10045 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 10046 bpf_target_off(struct sock_common, 10047 skc_dport, 10048 2, target_size)); 10049 #ifndef __BIG_ENDIAN_BITFIELD 10050 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); 10051 #endif 10052 break; 10053 10054 case offsetof(struct __sk_buff, local_port): 10055 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); 10056 10057 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 10058 si->dst_reg, si->src_reg, 10059 offsetof(struct sk_buff, sk)); 10060 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 10061 bpf_target_off(struct sock_common, 10062 skc_num, 2, target_size)); 10063 break; 10064 10065 case offsetof(struct __sk_buff, tstamp): 10066 BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); 10067 10068 if (type == BPF_WRITE) 10069 insn = bpf_convert_tstamp_write(prog, si, insn); 10070 else 10071 insn = bpf_convert_tstamp_read(prog, si, insn); 10072 break; 10073 10074 case offsetof(struct __sk_buff, tstamp_type): 10075 insn = bpf_convert_tstamp_type_read(si, insn); 10076 break; 10077 10078 
case offsetof(struct __sk_buff, gso_segs): 10079 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); 10080 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), 10081 si->dst_reg, si->dst_reg, 10082 bpf_target_off(struct skb_shared_info, 10083 gso_segs, 2, 10084 target_size)); 10085 break; 10086 case offsetof(struct __sk_buff, gso_size): 10087 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); 10088 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), 10089 si->dst_reg, si->dst_reg, 10090 bpf_target_off(struct skb_shared_info, 10091 gso_size, 2, 10092 target_size)); 10093 break; 10094 case offsetof(struct __sk_buff, wire_len): 10095 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4); 10096 10097 off = si->off; 10098 off -= offsetof(struct __sk_buff, wire_len); 10099 off += offsetof(struct sk_buff, cb); 10100 off += offsetof(struct qdisc_skb_cb, pkt_len); 10101 *target_size = 4; 10102 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); 10103 break; 10104 10105 case offsetof(struct __sk_buff, sk): 10106 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), 10107 si->dst_reg, si->src_reg, 10108 offsetof(struct sk_buff, sk)); 10109 break; 10110 case offsetof(struct __sk_buff, hwtstamp): 10111 BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); 10112 BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); 10113 10114 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); 10115 *insn++ = BPF_LDX_MEM(BPF_DW, 10116 si->dst_reg, si->dst_reg, 10117 bpf_target_off(struct skb_shared_info, 10118 hwtstamps, 8, 10119 target_size)); 10120 break; 10121 } 10122 10123 return insn - insn_buf; 10124 } 10125 10126 u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, 10127 const struct bpf_insn *si, 10128 struct bpf_insn *insn_buf, 10129 struct bpf_prog *prog, u32 *target_size) 10130 { 10131 struct bpf_insn *insn = insn_buf; 10132 int off; 10133 
10134 switch (si->off) { 10135 case offsetof(struct bpf_sock, bound_dev_if): 10136 BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4); 10137 10138 if (type == BPF_WRITE) 10139 *insn++ = BPF_EMIT_STORE(BPF_W, si, 10140 offsetof(struct sock, sk_bound_dev_if)); 10141 else 10142 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 10143 offsetof(struct sock, sk_bound_dev_if)); 10144 break; 10145 10146 case offsetof(struct bpf_sock, mark): 10147 BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4); 10148 10149 if (type == BPF_WRITE) 10150 *insn++ = BPF_EMIT_STORE(BPF_W, si, 10151 offsetof(struct sock, sk_mark)); 10152 else 10153 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 10154 offsetof(struct sock, sk_mark)); 10155 break; 10156 10157 case offsetof(struct bpf_sock, priority): 10158 BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4); 10159 10160 if (type == BPF_WRITE) 10161 *insn++ = BPF_EMIT_STORE(BPF_W, si, 10162 offsetof(struct sock, sk_priority)); 10163 else 10164 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 10165 offsetof(struct sock, sk_priority)); 10166 break; 10167 10168 case offsetof(struct bpf_sock, family): 10169 *insn++ = BPF_LDX_MEM( 10170 BPF_FIELD_SIZEOF(struct sock_common, skc_family), 10171 si->dst_reg, si->src_reg, 10172 bpf_target_off(struct sock_common, 10173 skc_family, 10174 sizeof_field(struct sock_common, 10175 skc_family), 10176 target_size)); 10177 break; 10178 10179 case offsetof(struct bpf_sock, type): 10180 *insn++ = BPF_LDX_MEM( 10181 BPF_FIELD_SIZEOF(struct sock, sk_type), 10182 si->dst_reg, si->src_reg, 10183 bpf_target_off(struct sock, sk_type, 10184 sizeof_field(struct sock, sk_type), 10185 target_size)); 10186 break; 10187 10188 case offsetof(struct bpf_sock, protocol): 10189 *insn++ = BPF_LDX_MEM( 10190 BPF_FIELD_SIZEOF(struct sock, sk_protocol), 10191 si->dst_reg, si->src_reg, 10192 bpf_target_off(struct sock, sk_protocol, 10193 sizeof_field(struct sock, sk_protocol), 10194 target_size)); 
10195 break; 10196 10197 case offsetof(struct bpf_sock, src_ip4): 10198 *insn++ = BPF_LDX_MEM( 10199 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 10200 bpf_target_off(struct sock_common, skc_rcv_saddr, 10201 sizeof_field(struct sock_common, 10202 skc_rcv_saddr), 10203 target_size)); 10204 break; 10205 10206 case offsetof(struct bpf_sock, dst_ip4): 10207 *insn++ = BPF_LDX_MEM( 10208 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 10209 bpf_target_off(struct sock_common, skc_daddr, 10210 sizeof_field(struct sock_common, 10211 skc_daddr), 10212 target_size)); 10213 break; 10214 10215 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): 10216 #if IS_ENABLED(CONFIG_IPV6) 10217 off = si->off; 10218 off -= offsetof(struct bpf_sock, src_ip6[0]); 10219 *insn++ = BPF_LDX_MEM( 10220 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 10221 bpf_target_off( 10222 struct sock_common, 10223 skc_v6_rcv_saddr.s6_addr32[0], 10224 sizeof_field(struct sock_common, 10225 skc_v6_rcv_saddr.s6_addr32[0]), 10226 target_size) + off); 10227 #else 10228 (void)off; 10229 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10230 #endif 10231 break; 10232 10233 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): 10234 #if IS_ENABLED(CONFIG_IPV6) 10235 off = si->off; 10236 off -= offsetof(struct bpf_sock, dst_ip6[0]); 10237 *insn++ = BPF_LDX_MEM( 10238 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 10239 bpf_target_off(struct sock_common, 10240 skc_v6_daddr.s6_addr32[0], 10241 sizeof_field(struct sock_common, 10242 skc_v6_daddr.s6_addr32[0]), 10243 target_size) + off); 10244 #else 10245 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10246 *target_size = 4; 10247 #endif 10248 break; 10249 10250 case offsetof(struct bpf_sock, src_port): 10251 *insn++ = BPF_LDX_MEM( 10252 BPF_FIELD_SIZEOF(struct sock_common, skc_num), 10253 si->dst_reg, si->src_reg, 10254 bpf_target_off(struct sock_common, skc_num, 10255 sizeof_field(struct sock_common, 10256 skc_num), 10257 target_size)); 10258 break; 10259 10260 
case offsetof(struct bpf_sock, dst_port): 10261 *insn++ = BPF_LDX_MEM( 10262 BPF_FIELD_SIZEOF(struct sock_common, skc_dport), 10263 si->dst_reg, si->src_reg, 10264 bpf_target_off(struct sock_common, skc_dport, 10265 sizeof_field(struct sock_common, 10266 skc_dport), 10267 target_size)); 10268 break; 10269 10270 case offsetof(struct bpf_sock, state): 10271 *insn++ = BPF_LDX_MEM( 10272 BPF_FIELD_SIZEOF(struct sock_common, skc_state), 10273 si->dst_reg, si->src_reg, 10274 bpf_target_off(struct sock_common, skc_state, 10275 sizeof_field(struct sock_common, 10276 skc_state), 10277 target_size)); 10278 break; 10279 case offsetof(struct bpf_sock, rx_queue_mapping): 10280 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING 10281 *insn++ = BPF_LDX_MEM( 10282 BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), 10283 si->dst_reg, si->src_reg, 10284 bpf_target_off(struct sock, sk_rx_queue_mapping, 10285 sizeof_field(struct sock, 10286 sk_rx_queue_mapping), 10287 target_size)); 10288 *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, 10289 1); 10290 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); 10291 #else 10292 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); 10293 *target_size = 2; 10294 #endif 10295 break; 10296 } 10297 10298 return insn - insn_buf; 10299 } 10300 10301 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, 10302 const struct bpf_insn *si, 10303 struct bpf_insn *insn_buf, 10304 struct bpf_prog *prog, u32 *target_size) 10305 { 10306 struct bpf_insn *insn = insn_buf; 10307 10308 switch (si->off) { 10309 case offsetof(struct __sk_buff, ifindex): 10310 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 10311 si->dst_reg, si->src_reg, 10312 offsetof(struct sk_buff, dev)); 10313 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10314 bpf_target_off(struct net_device, ifindex, 4, 10315 target_size)); 10316 break; 10317 default: 10318 return bpf_convert_ctx_access(type, si, insn_buf, prog, 10319 target_size); 10320 } 10321 10322 return insn - insn_buf; 
10323 } 10324 10325 static u32 xdp_convert_ctx_access(enum bpf_access_type type, 10326 const struct bpf_insn *si, 10327 struct bpf_insn *insn_buf, 10328 struct bpf_prog *prog, u32 *target_size) 10329 { 10330 struct bpf_insn *insn = insn_buf; 10331 10332 switch (si->off) { 10333 case offsetof(struct xdp_md, data): 10334 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 10335 si->dst_reg, si->src_reg, 10336 offsetof(struct xdp_buff, data)); 10337 break; 10338 case offsetof(struct xdp_md, data_meta): 10339 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), 10340 si->dst_reg, si->src_reg, 10341 offsetof(struct xdp_buff, data_meta)); 10342 break; 10343 case offsetof(struct xdp_md, data_end): 10344 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), 10345 si->dst_reg, si->src_reg, 10346 offsetof(struct xdp_buff, data_end)); 10347 break; 10348 case offsetof(struct xdp_md, ingress_ifindex): 10349 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), 10350 si->dst_reg, si->src_reg, 10351 offsetof(struct xdp_buff, rxq)); 10352 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), 10353 si->dst_reg, si->dst_reg, 10354 offsetof(struct xdp_rxq_info, dev)); 10355 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10356 offsetof(struct net_device, ifindex)); 10357 break; 10358 case offsetof(struct xdp_md, rx_queue_index): 10359 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), 10360 si->dst_reg, si->src_reg, 10361 offsetof(struct xdp_buff, rxq)); 10362 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10363 offsetof(struct xdp_rxq_info, 10364 queue_index)); 10365 break; 10366 case offsetof(struct xdp_md, egress_ifindex): 10367 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), 10368 si->dst_reg, si->src_reg, 10369 offsetof(struct xdp_buff, txq)); 10370 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), 10371 si->dst_reg, si->dst_reg, 10372 offsetof(struct xdp_txq_info, 
dev)); 10373 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10374 offsetof(struct net_device, ifindex)); 10375 break; 10376 } 10377 10378 return insn - insn_buf; 10379 } 10380 10381 /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of 10382 * context Structure, F is Field in context structure that contains a pointer 10383 * to Nested Structure of type NS that has the field NF. 10384 * 10385 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make 10386 * sure that SIZE is not greater than actual size of S.F.NF. 10387 * 10388 * If offset OFF is provided, the load happens from that offset relative to 10389 * offset of NF. 10390 */ 10391 #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ 10392 do { \ 10393 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ 10394 si->src_reg, offsetof(S, F)); \ 10395 *insn++ = BPF_LDX_MEM( \ 10396 SIZE, si->dst_reg, si->dst_reg, \ 10397 bpf_target_off(NS, NF, sizeof_field(NS, NF), \ 10398 target_size) \ 10399 + OFF); \ 10400 } while (0) 10401 10402 #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ 10403 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ 10404 BPF_FIELD_SIZEOF(NS, NF), 0) 10405 10406 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to 10407 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. 10408 * 10409 * In addition it uses Temporary Field TF (member of struct S) as the 3rd 10410 * "register" since two registers available in convert_ctx_access are not 10411 * enough: we can't override neither SRC, since it contains value to store, nor 10412 * DST since it contains pointer to context that may be used by later 10413 * instructions. But we need a temporary place to save pointer to nested 10414 * structure whose field we want to store to. 
10415 */ 10416 #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ 10417 do { \ 10418 int tmp_reg = BPF_REG_9; \ 10419 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ 10420 --tmp_reg; \ 10421 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ 10422 --tmp_reg; \ 10423 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ 10424 offsetof(S, TF)); \ 10425 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ 10426 si->dst_reg, offsetof(S, F)); \ 10427 *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \ 10428 tmp_reg, si->src_reg, \ 10429 bpf_target_off(NS, NF, sizeof_field(NS, NF), \ 10430 target_size) \ 10431 + OFF, \ 10432 si->imm); \ 10433 *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ 10434 offsetof(S, TF)); \ 10435 } while (0) 10436 10437 #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ 10438 TF) \ 10439 do { \ 10440 if (type == BPF_WRITE) { \ 10441 SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ 10442 OFF, TF); \ 10443 } else { \ 10444 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ 10445 S, NS, F, NF, SIZE, OFF); \ 10446 } \ 10447 } while (0) 10448 10449 static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, 10450 const struct bpf_insn *si, 10451 struct bpf_insn *insn_buf, 10452 struct bpf_prog *prog, u32 *target_size) 10453 { 10454 int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); 10455 struct bpf_insn *insn = insn_buf; 10456 10457 switch (si->off) { 10458 case offsetof(struct bpf_sock_addr, user_family): 10459 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, 10460 struct sockaddr, uaddr, sa_family); 10461 break; 10462 10463 case offsetof(struct bpf_sock_addr, user_ip4): 10464 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 10465 struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, 10466 sin_addr, BPF_SIZE(si->code), 0, tmp_reg); 10467 break; 10468 10469 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): 10470 off = 
si->off; 10471 off -= offsetof(struct bpf_sock_addr, user_ip6[0]); 10472 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 10473 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, 10474 sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, 10475 tmp_reg); 10476 break; 10477 10478 case offsetof(struct bpf_sock_addr, user_port): 10479 /* To get port we need to know sa_family first and then treat 10480 * sockaddr as either sockaddr_in or sockaddr_in6. 10481 * Though we can simplify since port field has same offset and 10482 * size in both structures. 10483 * Here we check this invariant and use just one of the 10484 * structures if it's true. 10485 */ 10486 BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != 10487 offsetof(struct sockaddr_in6, sin6_port)); 10488 BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != 10489 sizeof_field(struct sockaddr_in6, sin6_port)); 10490 /* Account for sin6_port being smaller than user_port. */ 10491 port_size = min(port_size, BPF_LDST_BYTES(si)); 10492 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 10493 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, 10494 sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); 10495 break; 10496 10497 case offsetof(struct bpf_sock_addr, family): 10498 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, 10499 struct sock, sk, sk_family); 10500 break; 10501 10502 case offsetof(struct bpf_sock_addr, type): 10503 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, 10504 struct sock, sk, sk_type); 10505 break; 10506 10507 case offsetof(struct bpf_sock_addr, protocol): 10508 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, 10509 struct sock, sk, sk_protocol); 10510 break; 10511 10512 case offsetof(struct bpf_sock_addr, msg_src_ip4): 10513 /* Treat t_ctx as struct in_addr for msg_src_ip4. 
*/ 10514 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 10515 struct bpf_sock_addr_kern, struct in_addr, t_ctx, 10516 s_addr, BPF_SIZE(si->code), 0, tmp_reg); 10517 break; 10518 10519 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], 10520 msg_src_ip6[3]): 10521 off = si->off; 10522 off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); 10523 /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ 10524 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( 10525 struct bpf_sock_addr_kern, struct in6_addr, t_ctx, 10526 s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); 10527 break; 10528 case offsetof(struct bpf_sock_addr, sk): 10529 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk), 10530 si->dst_reg, si->src_reg, 10531 offsetof(struct bpf_sock_addr_kern, sk)); 10532 break; 10533 } 10534 10535 return insn - insn_buf; 10536 } 10537 10538 static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, 10539 const struct bpf_insn *si, 10540 struct bpf_insn *insn_buf, 10541 struct bpf_prog *prog, 10542 u32 *target_size) 10543 { 10544 struct bpf_insn *insn = insn_buf; 10545 int off; 10546 10547 /* Helper macro for adding read access to tcp_sock or sock fields. 
*/ 10548 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ 10549 do { \ 10550 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \ 10551 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ 10552 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ 10553 if (si->dst_reg == reg || si->src_reg == reg) \ 10554 reg--; \ 10555 if (si->dst_reg == reg || si->src_reg == reg) \ 10556 reg--; \ 10557 if (si->dst_reg == si->src_reg) { \ 10558 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ 10559 offsetof(struct bpf_sock_ops_kern, \ 10560 temp)); \ 10561 fullsock_reg = reg; \ 10562 jmp += 2; \ 10563 } \ 10564 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10565 struct bpf_sock_ops_kern, \ 10566 is_locked_tcp_sock), \ 10567 fullsock_reg, si->src_reg, \ 10568 offsetof(struct bpf_sock_ops_kern, \ 10569 is_locked_tcp_sock)); \ 10570 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ 10571 if (si->dst_reg == si->src_reg) \ 10572 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ 10573 offsetof(struct bpf_sock_ops_kern, \ 10574 temp)); \ 10575 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10576 struct bpf_sock_ops_kern, sk),\ 10577 si->dst_reg, si->src_reg, \ 10578 offsetof(struct bpf_sock_ops_kern, sk));\ 10579 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ 10580 OBJ_FIELD), \ 10581 si->dst_reg, si->dst_reg, \ 10582 offsetof(OBJ, OBJ_FIELD)); \ 10583 if (si->dst_reg == si->src_reg) { \ 10584 *insn++ = BPF_JMP_A(1); \ 10585 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ 10586 offsetof(struct bpf_sock_ops_kern, \ 10587 temp)); \ 10588 } \ 10589 } while (0) 10590 10591 #define SOCK_OPS_GET_SK() \ 10592 do { \ 10593 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \ 10594 if (si->dst_reg == reg || si->src_reg == reg) \ 10595 reg--; \ 10596 if (si->dst_reg == reg || si->src_reg == reg) \ 10597 reg--; \ 10598 if (si->dst_reg == si->src_reg) { \ 10599 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \ 10600 offsetof(struct bpf_sock_ops_kern, \ 10601 temp)); \ 10602 fullsock_reg = 
reg; \ 10603 jmp += 2; \ 10604 } \ 10605 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10606 struct bpf_sock_ops_kern, \ 10607 is_fullsock), \ 10608 fullsock_reg, si->src_reg, \ 10609 offsetof(struct bpf_sock_ops_kern, \ 10610 is_fullsock)); \ 10611 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ 10612 if (si->dst_reg == si->src_reg) \ 10613 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ 10614 offsetof(struct bpf_sock_ops_kern, \ 10615 temp)); \ 10616 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10617 struct bpf_sock_ops_kern, sk),\ 10618 si->dst_reg, si->src_reg, \ 10619 offsetof(struct bpf_sock_ops_kern, sk));\ 10620 if (si->dst_reg == si->src_reg) { \ 10621 *insn++ = BPF_JMP_A(1); \ 10622 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ 10623 offsetof(struct bpf_sock_ops_kern, \ 10624 temp)); \ 10625 } \ 10626 } while (0) 10627 10628 #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ 10629 SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) 10630 10631 /* Helper macro for adding write access to tcp_sock or sock fields. 10632 * The macro is called with two registers, dst_reg which contains a pointer 10633 * to ctx (context) and src_reg which contains the value that should be 10634 * stored. However, we need an additional register since we cannot overwrite 10635 * dst_reg because it may be used later in the program. 10636 * Instead we "borrow" one of the other register. We first save its value 10637 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore 10638 * it at the end of the macro. 
10639 */ 10640 #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ 10641 do { \ 10642 int reg = BPF_REG_9; \ 10643 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \ 10644 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \ 10645 if (si->dst_reg == reg || si->src_reg == reg) \ 10646 reg--; \ 10647 if (si->dst_reg == reg || si->src_reg == reg) \ 10648 reg--; \ 10649 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ 10650 offsetof(struct bpf_sock_ops_kern, \ 10651 temp)); \ 10652 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10653 struct bpf_sock_ops_kern, \ 10654 is_locked_tcp_sock), \ 10655 reg, si->dst_reg, \ 10656 offsetof(struct bpf_sock_ops_kern, \ 10657 is_locked_tcp_sock)); \ 10658 *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ 10659 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ 10660 struct bpf_sock_ops_kern, sk),\ 10661 reg, si->dst_reg, \ 10662 offsetof(struct bpf_sock_ops_kern, sk));\ 10663 *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \ 10664 BPF_MEM | BPF_CLASS(si->code), \ 10665 reg, si->src_reg, \ 10666 offsetof(OBJ, OBJ_FIELD), \ 10667 si->imm); \ 10668 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ 10669 offsetof(struct bpf_sock_ops_kern, \ 10670 temp)); \ 10671 } while (0) 10672 10673 #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ 10674 do { \ 10675 if (TYPE == BPF_WRITE) \ 10676 SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ 10677 else \ 10678 SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ 10679 } while (0) 10680 10681 switch (si->off) { 10682 case offsetof(struct bpf_sock_ops, op): 10683 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10684 op), 10685 si->dst_reg, si->src_reg, 10686 offsetof(struct bpf_sock_ops_kern, op)); 10687 break; 10688 10689 case offsetof(struct bpf_sock_ops, replylong[0]) ... 
10690 offsetof(struct bpf_sock_ops, replylong[3]): 10691 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) != 10692 sizeof_field(struct bpf_sock_ops_kern, reply)); 10693 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) != 10694 sizeof_field(struct bpf_sock_ops_kern, replylong)); 10695 off = si->off; 10696 off -= offsetof(struct bpf_sock_ops, replylong[0]); 10697 off += offsetof(struct bpf_sock_ops_kern, replylong[0]); 10698 if (type == BPF_WRITE) 10699 *insn++ = BPF_EMIT_STORE(BPF_W, si, off); 10700 else 10701 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 10702 off); 10703 break; 10704 10705 case offsetof(struct bpf_sock_ops, family): 10706 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2); 10707 10708 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10709 struct bpf_sock_ops_kern, sk), 10710 si->dst_reg, si->src_reg, 10711 offsetof(struct bpf_sock_ops_kern, sk)); 10712 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 10713 offsetof(struct sock_common, skc_family)); 10714 break; 10715 10716 case offsetof(struct bpf_sock_ops, remote_ip4): 10717 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4); 10718 10719 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10720 struct bpf_sock_ops_kern, sk), 10721 si->dst_reg, si->src_reg, 10722 offsetof(struct bpf_sock_ops_kern, sk)); 10723 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10724 offsetof(struct sock_common, skc_daddr)); 10725 break; 10726 10727 case offsetof(struct bpf_sock_ops, local_ip4): 10728 BUILD_BUG_ON(sizeof_field(struct sock_common, 10729 skc_rcv_saddr) != 4); 10730 10731 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10732 struct bpf_sock_ops_kern, sk), 10733 si->dst_reg, si->src_reg, 10734 offsetof(struct bpf_sock_ops_kern, sk)); 10735 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10736 offsetof(struct sock_common, 10737 skc_rcv_saddr)); 10738 break; 10739 10740 case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... 
10741 offsetof(struct bpf_sock_ops, remote_ip6[3]): 10742 #if IS_ENABLED(CONFIG_IPV6) 10743 BUILD_BUG_ON(sizeof_field(struct sock_common, 10744 skc_v6_daddr.s6_addr32[0]) != 4); 10745 10746 off = si->off; 10747 off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); 10748 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10749 struct bpf_sock_ops_kern, sk), 10750 si->dst_reg, si->src_reg, 10751 offsetof(struct bpf_sock_ops_kern, sk)); 10752 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10753 offsetof(struct sock_common, 10754 skc_v6_daddr.s6_addr32[0]) + 10755 off); 10756 #else 10757 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10758 #endif 10759 break; 10760 10761 case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 10762 offsetof(struct bpf_sock_ops, local_ip6[3]): 10763 #if IS_ENABLED(CONFIG_IPV6) 10764 BUILD_BUG_ON(sizeof_field(struct sock_common, 10765 skc_v6_rcv_saddr.s6_addr32[0]) != 4); 10766 10767 off = si->off; 10768 off -= offsetof(struct bpf_sock_ops, local_ip6[0]); 10769 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10770 struct bpf_sock_ops_kern, sk), 10771 si->dst_reg, si->src_reg, 10772 offsetof(struct bpf_sock_ops_kern, sk)); 10773 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10774 offsetof(struct sock_common, 10775 skc_v6_rcv_saddr.s6_addr32[0]) + 10776 off); 10777 #else 10778 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); 10779 #endif 10780 break; 10781 10782 case offsetof(struct bpf_sock_ops, remote_port): 10783 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2); 10784 10785 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10786 struct bpf_sock_ops_kern, sk), 10787 si->dst_reg, si->src_reg, 10788 offsetof(struct bpf_sock_ops_kern, sk)); 10789 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 10790 offsetof(struct sock_common, skc_dport)); 10791 #ifndef __BIG_ENDIAN_BITFIELD 10792 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); 10793 #endif 10794 break; 10795 10796 case offsetof(struct bpf_sock_ops, local_port): 10797 
BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2); 10798 10799 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10800 struct bpf_sock_ops_kern, sk), 10801 si->dst_reg, si->src_reg, 10802 offsetof(struct bpf_sock_ops_kern, sk)); 10803 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, 10804 offsetof(struct sock_common, skc_num)); 10805 break; 10806 10807 case offsetof(struct bpf_sock_ops, is_fullsock): 10808 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10809 struct bpf_sock_ops_kern, 10810 is_fullsock), 10811 si->dst_reg, si->src_reg, 10812 offsetof(struct bpf_sock_ops_kern, 10813 is_fullsock)); 10814 break; 10815 10816 case offsetof(struct bpf_sock_ops, state): 10817 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1); 10818 10819 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10820 struct bpf_sock_ops_kern, sk), 10821 si->dst_reg, si->src_reg, 10822 offsetof(struct bpf_sock_ops_kern, sk)); 10823 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, 10824 offsetof(struct sock_common, skc_state)); 10825 break; 10826 10827 case offsetof(struct bpf_sock_ops, rtt_min): 10828 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) != 10829 sizeof(struct minmax)); 10830 BUILD_BUG_ON(sizeof(struct minmax) < 10831 sizeof(struct minmax_sample)); 10832 10833 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( 10834 struct bpf_sock_ops_kern, sk), 10835 si->dst_reg, si->src_reg, 10836 offsetof(struct bpf_sock_ops_kern, sk)); 10837 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 10838 offsetof(struct tcp_sock, rtt_min) + 10839 sizeof_field(struct minmax_sample, t)); 10840 break; 10841 10842 case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): 10843 SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, 10844 struct tcp_sock); 10845 break; 10846 10847 case offsetof(struct bpf_sock_ops, sk_txhash): 10848 SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, 10849 struct sock, type); 10850 break; 10851 case offsetof(struct bpf_sock_ops, snd_cwnd): 10852 
SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd); 10853 break; 10854 case offsetof(struct bpf_sock_ops, srtt_us): 10855 SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us); 10856 break; 10857 case offsetof(struct bpf_sock_ops, snd_ssthresh): 10858 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh); 10859 break; 10860 case offsetof(struct bpf_sock_ops, rcv_nxt): 10861 SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt); 10862 break; 10863 case offsetof(struct bpf_sock_ops, snd_nxt): 10864 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt); 10865 break; 10866 case offsetof(struct bpf_sock_ops, snd_una): 10867 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una); 10868 break; 10869 case offsetof(struct bpf_sock_ops, mss_cache): 10870 SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache); 10871 break; 10872 case offsetof(struct bpf_sock_ops, ecn_flags): 10873 SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags); 10874 break; 10875 case offsetof(struct bpf_sock_ops, rate_delivered): 10876 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered); 10877 break; 10878 case offsetof(struct bpf_sock_ops, rate_interval_us): 10879 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us); 10880 break; 10881 case offsetof(struct bpf_sock_ops, packets_out): 10882 SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out); 10883 break; 10884 case offsetof(struct bpf_sock_ops, retrans_out): 10885 SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out); 10886 break; 10887 case offsetof(struct bpf_sock_ops, total_retrans): 10888 SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans); 10889 break; 10890 case offsetof(struct bpf_sock_ops, segs_in): 10891 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in); 10892 break; 10893 case offsetof(struct bpf_sock_ops, data_segs_in): 10894 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in); 10895 break; 10896 case offsetof(struct bpf_sock_ops, segs_out): 10897 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out); 10898 break; 10899 case offsetof(struct bpf_sock_ops, data_segs_out): 10900 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out); 10901 break; 10902 case offsetof(struct bpf_sock_ops, lost_out): 10903 SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out); 10904 break; 10905 
case offsetof(struct bpf_sock_ops, sacked_out): 10906 SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out); 10907 break; 10908 case offsetof(struct bpf_sock_ops, bytes_received): 10909 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received); 10910 break; 10911 case offsetof(struct bpf_sock_ops, bytes_acked): 10912 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked); 10913 break; 10914 case offsetof(struct bpf_sock_ops, sk): 10915 SOCK_OPS_GET_SK(); 10916 break; 10917 case offsetof(struct bpf_sock_ops, skb_data_end): 10918 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10919 skb_data_end), 10920 si->dst_reg, si->src_reg, 10921 offsetof(struct bpf_sock_ops_kern, 10922 skb_data_end)); 10923 break; 10924 case offsetof(struct bpf_sock_ops, skb_data): 10925 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10926 skb), 10927 si->dst_reg, si->src_reg, 10928 offsetof(struct bpf_sock_ops_kern, 10929 skb)); 10930 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 10931 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 10932 si->dst_reg, si->dst_reg, 10933 offsetof(struct sk_buff, data)); 10934 break; 10935 case offsetof(struct bpf_sock_ops, skb_len): 10936 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10937 skb), 10938 si->dst_reg, si->src_reg, 10939 offsetof(struct bpf_sock_ops_kern, 10940 skb)); 10941 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 10942 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), 10943 si->dst_reg, si->dst_reg, 10944 offsetof(struct sk_buff, len)); 10945 break; 10946 case offsetof(struct bpf_sock_ops, skb_tcp_flags): 10947 off = offsetof(struct sk_buff, cb); 10948 off += offsetof(struct tcp_skb_cb, tcp_flags); 10949 *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); 10950 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10951 skb), 10952 si->dst_reg, si->src_reg, 10953 offsetof(struct bpf_sock_ops_kern, 10954 skb)); 10955 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 10956 *insn++ 
= BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, 10957 tcp_flags), 10958 si->dst_reg, si->dst_reg, off); 10959 break; 10960 case offsetof(struct bpf_sock_ops, skb_hwtstamp): { 10961 struct bpf_insn *jmp_on_null_skb; 10962 10963 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, 10964 skb), 10965 si->dst_reg, si->src_reg, 10966 offsetof(struct bpf_sock_ops_kern, 10967 skb)); 10968 /* Reserve one insn to test skb == NULL */ 10969 jmp_on_null_skb = insn++; 10970 insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); 10971 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, 10972 bpf_target_off(struct skb_shared_info, 10973 hwtstamps, 8, 10974 target_size)); 10975 *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 10976 insn - jmp_on_null_skb - 1); 10977 break; 10978 } 10979 } 10980 return insn - insn_buf; 10981 } 10982 10983 /* data_end = skb->data + skb_headlen() */ 10984 static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, 10985 struct bpf_insn *insn) 10986 { 10987 int reg; 10988 int temp_reg_off = offsetof(struct sk_buff, cb) + 10989 offsetof(struct sk_skb_cb, temp_reg); 10990 10991 if (si->src_reg == si->dst_reg) { 10992 /* We need an extra register, choose and save a register. 
*/ 10993 reg = BPF_REG_9; 10994 if (si->src_reg == reg || si->dst_reg == reg) 10995 reg--; 10996 if (si->src_reg == reg || si->dst_reg == reg) 10997 reg--; 10998 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off); 10999 } else { 11000 reg = si->dst_reg; 11001 } 11002 11003 /* reg = skb->data */ 11004 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 11005 reg, si->src_reg, 11006 offsetof(struct sk_buff, data)); 11007 /* AX = skb->len */ 11008 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), 11009 BPF_REG_AX, si->src_reg, 11010 offsetof(struct sk_buff, len)); 11011 /* reg = skb->data + skb->len */ 11012 *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX); 11013 /* AX = skb->data_len */ 11014 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), 11015 BPF_REG_AX, si->src_reg, 11016 offsetof(struct sk_buff, data_len)); 11017 11018 /* reg = skb->data + skb->len - skb->data_len */ 11019 *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX); 11020 11021 if (si->src_reg == si->dst_reg) { 11022 /* Restore the saved register */ 11023 *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg); 11024 *insn++ = BPF_MOV64_REG(si->dst_reg, reg); 11025 *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off); 11026 } 11027 11028 return insn; 11029 } 11030 11031 static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, 11032 const struct bpf_insn *si, 11033 struct bpf_insn *insn_buf, 11034 struct bpf_prog *prog, u32 *target_size) 11035 { 11036 struct bpf_insn *insn = insn_buf; 11037 int off; 11038 11039 switch (si->off) { 11040 case offsetof(struct __sk_buff, data_end): 11041 insn = bpf_convert_data_end_access(si, insn); 11042 break; 11043 case offsetof(struct __sk_buff, cb[0]) ... 
offsetofend(struct __sk_buff, cb[4]) - 1:
		BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
			      offsetof(struct sk_skb_cb, data)) %
			     sizeof(__u64));

		/* Record that this program touches the control block. */
		prog->cb_access = 1;
		/* Rebase the __sk_buff cb[] offset onto sk_skb_cb::data,
		 * which lives inside sk_buff::cb.
		 */
		off  = si->off;
		off -= offsetof(struct __sk_buff, cb[0]);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct sk_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
					      si->src_reg, off);
		break;


	default:
		/* Everything else is handled by the generic skb conversion. */
		return bpf_convert_ctx_access(type, si, insn_buf, prog,
					      target_size);
	}

	return insn - insn_buf;
}

/* Rewrite a context access on struct sk_msg_md into loads from the
 * underlying struct sk_msg.
 *
 * @type:        access type; not consulted here, every case emits loads
 * @si:          the original instruction (offset, source and destination)
 * @insn_buf:    output buffer for the replacement instructions
 * @prog:        program being rewritten; unused
 * @target_size: unused by this converter
 *
 * Socket-derived fields (family, addresses, ports) take two loads: first
 * sk_msg::sk into the destination register, then the sock_common member
 * through that pointer. The BUILD_BUG_ON()s pin the member sizes that the
 * hard-coded load widths rely on.
 *
 * Returns the number of instructions written to @insn_buf; an offset that
 * matches no case writes nothing and returns 0.
 */
static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
	int off;
#endif

	/* convert ctx uses the fact sg element is first in struct */
	BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

	switch (si->off) {
	case offsetof(struct sk_msg_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, data));
		break;
	case offsetof(struct sk_msg_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, data_end));
		break;
	case offsetof(struct sk_msg_md, family):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_family));
		break;

	case offsetof(struct sk_msg_md, remote_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_daddr));
		break;

	case offsetof(struct sk_msg_md, local_ip4):
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_rcv_saddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_rcv_saddr));
		break;

	case offsetof(struct sk_msg_md, remote_ip6[0]) ...
	     offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_daddr.s6_addr32[0]) != 4);

		/* Byte offset of the requested word within remote_ip6[]. */
		off = si->off;
		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_daddr.s6_addr32[0]) +
				      off);
#else
		/* Without IPv6 support the field reads as zero. */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct sk_msg_md, local_ip6[0]) ...
	     offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(sizeof_field(struct sock_common,
					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);

		/* Byte offset of the requested word within local_ip6[]. */
		off = si->off;
		off -= offsetof(struct sk_msg_md, local_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_rcv_saddr.s6_addr32[0]) +
				      off);
#else
		/* Without IPv6 support the field reads as zero. */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct sk_msg_md, remote_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
		/* Move the 16-bit port into the upper half of the 32-bit
		 * result on builds without __BIG_ENDIAN_BITFIELD.
		 */
		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
		break;

	case offsetof(struct sk_msg_md, local_port):
		BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_num));
		break;

	case offsetof(struct sk_msg_md, size):
		/* The offset is taken within struct sk_msg_sg but applied to
		 * the sk_msg pointer; valid because sg sits at offset 0
		 * (see the BUILD_BUG_ON above).
		 */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg_sg, size));
		break;

	case offsetof(struct sk_msg_md, sk):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg, sk));
		break;
	}

	return insn - insn_buf;
}

const
struct bpf_verifier_ops sk_filter_verifier_ops = {
	/* sk_filter: generic skb ctx conversion plus LD_ABS/LD_IND rewriting. */
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.gen_ld_abs		= bpf_gen_ld_abs,
};

const struct bpf_prog_ops sk_filter_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

/* tc_cls_act: own ctx conversion, a prologue, LD_ABS rewriting and a BTF
 * struct access hook.
 */
const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
	.gen_ld_abs		= bpf_gen_ld_abs,
	.btf_struct_access	= tc_cls_act_btf_struct_access,
};

const struct bpf_prog_ops tc_cls_act_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

/* xdp: uses bpf_noop_prologue as its prologue hook. */
const struct bpf_verifier_ops xdp_verifier_ops = {
	.get_func_proto		= xdp_func_proto,
	.is_valid_access	= xdp_is_valid_access,
	.convert_ctx_access	= xdp_convert_ctx_access,
	.gen_prologue		= bpf_noop_prologue,
	.btf_struct_access	= xdp_btf_struct_access,
};

const struct bpf_prog_ops xdp_prog_ops = {
	.test_run		= bpf_prog_test_run_xdp,
};

/* cg_skb: generic skb ctx conversion. */
const struct bpf_verifier_ops cg_skb_verifier_ops = {
	.get_func_proto		= cg_skb_func_proto,
	.is_valid_access	= cg_skb_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops cg_skb_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

/* The lwt_* tables below share lwt_is_valid_access and the generic skb ctx
 * conversion; they differ in the helper set and, for xmit, the prologue.
 */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
	.get_func_proto		= lwt_in_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_in_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_out_verifier_ops = {
	.get_func_proto		= lwt_out_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_out_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
	.get_func_proto		= lwt_xmit_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

const struct bpf_prog_ops lwt_xmit_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
	.get_func_proto		= lwt_seg6local_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

/* No callbacks are set here; unlike the other lwt_* tables there is no
 * .test_run.
 */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_verifier_ops = {
	.get_func_proto		= sock_filter_func_proto,
	.is_valid_access	= sock_filter_is_valid_access,
	.convert_ctx_access	= bpf_sock_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
	.get_func_proto		= sock_addr_func_proto,
	.is_valid_access	= sock_addr_is_valid_access,
	.convert_ctx_access	= sock_addr_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

const struct bpf_verifier_ops sock_ops_verifier_ops = {
	.get_func_proto		= sock_ops_func_proto,
	.is_valid_access	= sock_ops_is_valid_access,
	.convert_ctx_access	= sock_ops_convert_ctx_access,
};

const struct bpf_prog_ops sock_ops_prog_ops = {
};

const struct bpf_verifier_ops sk_skb_verifier_ops = {
	.get_func_proto		= sk_skb_func_proto,
	.is_valid_access	= sk_skb_is_valid_access,
.convert_ctx_access = sk_skb_convert_ctx_access, 11330 .gen_prologue = sk_skb_prologue, 11331 }; 11332 11333 const struct bpf_prog_ops sk_skb_prog_ops = { 11334 }; 11335 11336 const struct bpf_verifier_ops sk_msg_verifier_ops = { 11337 .get_func_proto = sk_msg_func_proto, 11338 .is_valid_access = sk_msg_is_valid_access, 11339 .convert_ctx_access = sk_msg_convert_ctx_access, 11340 .gen_prologue = bpf_noop_prologue, 11341 }; 11342 11343 const struct bpf_prog_ops sk_msg_prog_ops = { 11344 }; 11345 11346 const struct bpf_verifier_ops flow_dissector_verifier_ops = { 11347 .get_func_proto = flow_dissector_func_proto, 11348 .is_valid_access = flow_dissector_is_valid_access, 11349 .convert_ctx_access = flow_dissector_convert_ctx_access, 11350 }; 11351 11352 const struct bpf_prog_ops flow_dissector_prog_ops = { 11353 .test_run = bpf_prog_test_run_flow_dissector, 11354 }; 11355 11356 int sk_detach_filter(struct sock *sk) 11357 { 11358 int ret = -ENOENT; 11359 struct sk_filter *filter; 11360 11361 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 11362 return -EPERM; 11363 11364 filter = rcu_dereference_protected(sk->sk_filter, 11365 lockdep_sock_is_held(sk)); 11366 if (filter) { 11367 RCU_INIT_POINTER(sk->sk_filter, NULL); 11368 sk_filter_uncharge(sk, filter); 11369 ret = 0; 11370 } 11371 11372 return ret; 11373 } 11374 EXPORT_SYMBOL_GPL(sk_detach_filter); 11375 11376 int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) 11377 { 11378 struct sock_fprog_kern *fprog; 11379 struct sk_filter *filter; 11380 int ret = 0; 11381 11382 sockopt_lock_sock(sk); 11383 filter = rcu_dereference_protected(sk->sk_filter, 11384 lockdep_sock_is_held(sk)); 11385 if (!filter) 11386 goto out; 11387 11388 /* We're copying the filter that has been originally attached, 11389 * so no conversion/decode needed anymore. eBPF programs that 11390 * have no original program cannot be dumped through this. 
11391 */ 11392 ret = -EACCES; 11393 fprog = filter->prog->orig_prog; 11394 if (!fprog) 11395 goto out; 11396 11397 ret = fprog->len; 11398 if (!len) 11399 /* User space only enquires number of filter blocks. */ 11400 goto out; 11401 11402 ret = -EINVAL; 11403 if (len < fprog->len) 11404 goto out; 11405 11406 ret = -EFAULT; 11407 if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) 11408 goto out; 11409 11410 /* Instead of bytes, the API requests to return the number 11411 * of filter blocks. 11412 */ 11413 ret = fprog->len; 11414 out: 11415 sockopt_release_sock(sk); 11416 return ret; 11417 } 11418 11419 #ifdef CONFIG_INET 11420 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, 11421 struct sock_reuseport *reuse, 11422 struct sock *sk, struct sk_buff *skb, 11423 struct sock *migrating_sk, 11424 u32 hash) 11425 { 11426 reuse_kern->skb = skb; 11427 reuse_kern->sk = sk; 11428 reuse_kern->selected_sk = NULL; 11429 reuse_kern->migrating_sk = migrating_sk; 11430 reuse_kern->data_end = skb->data + skb_headlen(skb); 11431 reuse_kern->hash = hash; 11432 reuse_kern->reuseport_id = reuse->reuseport_id; 11433 reuse_kern->bind_inany = reuse->bind_inany; 11434 } 11435 11436 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, 11437 struct bpf_prog *prog, struct sk_buff *skb, 11438 struct sock *migrating_sk, 11439 u32 hash) 11440 { 11441 struct sk_reuseport_kern reuse_kern; 11442 enum sk_action action; 11443 11444 bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); 11445 action = bpf_prog_run(prog, &reuse_kern); 11446 11447 if (action == SK_PASS) 11448 return reuse_kern.selected_sk; 11449 else 11450 return ERR_PTR(-ECONNREFUSED); 11451 } 11452 11453 BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, 11454 struct bpf_map *, map, void *, key, u32, flags) 11455 { 11456 bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; 11457 struct sock_reuseport 
*reuse; 11458 struct sock *selected_sk; 11459 int err; 11460 11461 selected_sk = map->ops->map_lookup_elem(map, key); 11462 if (!selected_sk) 11463 return -ENOENT; 11464 11465 reuse = rcu_dereference(selected_sk->sk_reuseport_cb); 11466 if (!reuse) { 11467 /* reuseport_array has only sk with non NULL sk_reuseport_cb. 11468 * The only (!reuse) case here is - the sk has already been 11469 * unhashed (e.g. by close()), so treat it as -ENOENT. 11470 * 11471 * Other maps (e.g. sock_map) do not provide this guarantee and 11472 * the sk may never be in the reuseport group to begin with. 11473 */ 11474 err = is_sockarray ? -ENOENT : -EINVAL; 11475 goto error; 11476 } 11477 11478 if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { 11479 struct sock *sk = reuse_kern->sk; 11480 11481 if (sk->sk_protocol != selected_sk->sk_protocol) { 11482 err = -EPROTOTYPE; 11483 } else if (sk->sk_family != selected_sk->sk_family) { 11484 err = -EAFNOSUPPORT; 11485 } else { 11486 /* Catch all. Likely bound to a different sockaddr. */ 11487 err = -EBADFD; 11488 } 11489 goto error; 11490 } 11491 11492 reuse_kern->selected_sk = selected_sk; 11493 11494 return 0; 11495 error: 11496 /* Lookup in sock_map can return TCP ESTABLISHED sockets. 
*/ 11497 if (sk_is_refcounted(selected_sk)) 11498 sock_put(selected_sk); 11499 11500 return err; 11501 } 11502 11503 static const struct bpf_func_proto sk_select_reuseport_proto = { 11504 .func = sk_select_reuseport, 11505 .gpl_only = false, 11506 .ret_type = RET_INTEGER, 11507 .arg1_type = ARG_PTR_TO_CTX, 11508 .arg2_type = ARG_CONST_MAP_PTR, 11509 .arg3_type = ARG_PTR_TO_MAP_KEY, 11510 .arg4_type = ARG_ANYTHING, 11511 }; 11512 11513 BPF_CALL_4(sk_reuseport_load_bytes, 11514 const struct sk_reuseport_kern *, reuse_kern, u32, offset, 11515 void *, to, u32, len) 11516 { 11517 return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); 11518 } 11519 11520 static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { 11521 .func = sk_reuseport_load_bytes, 11522 .gpl_only = false, 11523 .ret_type = RET_INTEGER, 11524 .arg1_type = ARG_PTR_TO_CTX, 11525 .arg2_type = ARG_ANYTHING, 11526 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 11527 .arg4_type = ARG_CONST_SIZE, 11528 }; 11529 11530 BPF_CALL_5(sk_reuseport_load_bytes_relative, 11531 const struct sk_reuseport_kern *, reuse_kern, u32, offset, 11532 void *, to, u32, len, u32, start_header) 11533 { 11534 return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, 11535 len, start_header); 11536 } 11537 11538 static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { 11539 .func = sk_reuseport_load_bytes_relative, 11540 .gpl_only = false, 11541 .ret_type = RET_INTEGER, 11542 .arg1_type = ARG_PTR_TO_CTX, 11543 .arg2_type = ARG_ANYTHING, 11544 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 11545 .arg4_type = ARG_CONST_SIZE, 11546 .arg5_type = ARG_ANYTHING, 11547 }; 11548 11549 static const struct bpf_func_proto * 11550 sk_reuseport_func_proto(enum bpf_func_id func_id, 11551 const struct bpf_prog *prog) 11552 { 11553 switch (func_id) { 11554 case BPF_FUNC_sk_select_reuseport: 11555 return &sk_select_reuseport_proto; 11556 case BPF_FUNC_skb_load_bytes: 11557 return &sk_reuseport_load_bytes_proto; 11558 case 
BPF_FUNC_skb_load_bytes_relative: 11559 return &sk_reuseport_load_bytes_relative_proto; 11560 case BPF_FUNC_get_socket_cookie: 11561 return &bpf_get_socket_ptr_cookie_proto; 11562 case BPF_FUNC_ktime_get_coarse_ns: 11563 return &bpf_ktime_get_coarse_ns_proto; 11564 default: 11565 return bpf_base_func_proto(func_id, prog); 11566 } 11567 } 11568 11569 static bool 11570 sk_reuseport_is_valid_access(int off, int size, 11571 enum bpf_access_type type, 11572 const struct bpf_prog *prog, 11573 struct bpf_insn_access_aux *info) 11574 { 11575 const u32 size_default = sizeof(__u32); 11576 11577 if (off < 0 || off >= sizeof(struct sk_reuseport_md) || 11578 off % size || type != BPF_READ) 11579 return false; 11580 11581 switch (off) { 11582 case offsetof(struct sk_reuseport_md, data): 11583 info->reg_type = PTR_TO_PACKET; 11584 return size == sizeof(__u64); 11585 11586 case offsetof(struct sk_reuseport_md, data_end): 11587 info->reg_type = PTR_TO_PACKET_END; 11588 return size == sizeof(__u64); 11589 11590 case offsetof(struct sk_reuseport_md, hash): 11591 return size == size_default; 11592 11593 case offsetof(struct sk_reuseport_md, sk): 11594 info->reg_type = PTR_TO_SOCKET; 11595 return size == sizeof(__u64); 11596 11597 case offsetof(struct sk_reuseport_md, migrating_sk): 11598 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; 11599 return size == sizeof(__u64); 11600 11601 /* Fields that allow narrowing */ 11602 case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): 11603 if (size < sizeof_field(struct sk_buff, protocol)) 11604 return false; 11605 fallthrough; 11606 case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): 11607 case bpf_ctx_range(struct sk_reuseport_md, bind_inany): 11608 case bpf_ctx_range(struct sk_reuseport_md, len): 11609 bpf_ctx_record_field_size(info, size_default); 11610 return bpf_ctx_narrow_access_ok(off, size, size_default); 11611 11612 default: 11613 return false; 11614 } 11615 } 11616 11617 #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ 11618 *insn++ = 
BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ 11619 si->dst_reg, si->src_reg, \ 11620 bpf_target_off(struct sk_reuseport_kern, F, \ 11621 sizeof_field(struct sk_reuseport_kern, F), \ 11622 target_size)); \ 11623 }) 11624 11625 #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ 11626 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ 11627 struct sk_buff, \ 11628 skb, \ 11629 SKB_FIELD) 11630 11631 #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \ 11632 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ 11633 struct sock, \ 11634 sk, \ 11635 SK_FIELD) 11636 11637 static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, 11638 const struct bpf_insn *si, 11639 struct bpf_insn *insn_buf, 11640 struct bpf_prog *prog, 11641 u32 *target_size) 11642 { 11643 struct bpf_insn *insn = insn_buf; 11644 11645 switch (si->off) { 11646 case offsetof(struct sk_reuseport_md, data): 11647 SK_REUSEPORT_LOAD_SKB_FIELD(data); 11648 break; 11649 11650 case offsetof(struct sk_reuseport_md, len): 11651 SK_REUSEPORT_LOAD_SKB_FIELD(len); 11652 break; 11653 11654 case offsetof(struct sk_reuseport_md, eth_protocol): 11655 SK_REUSEPORT_LOAD_SKB_FIELD(protocol); 11656 break; 11657 11658 case offsetof(struct sk_reuseport_md, ip_protocol): 11659 SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol); 11660 break; 11661 11662 case offsetof(struct sk_reuseport_md, data_end): 11663 SK_REUSEPORT_LOAD_FIELD(data_end); 11664 break; 11665 11666 case offsetof(struct sk_reuseport_md, hash): 11667 SK_REUSEPORT_LOAD_FIELD(hash); 11668 break; 11669 11670 case offsetof(struct sk_reuseport_md, bind_inany): 11671 SK_REUSEPORT_LOAD_FIELD(bind_inany); 11672 break; 11673 11674 case offsetof(struct sk_reuseport_md, sk): 11675 SK_REUSEPORT_LOAD_FIELD(sk); 11676 break; 11677 11678 case offsetof(struct sk_reuseport_md, migrating_sk): 11679 SK_REUSEPORT_LOAD_FIELD(migrating_sk); 11680 break; 11681 } 11682 11683 return insn - insn_buf; 11684 } 11685 11686 const struct bpf_verifier_ops 
sk_reuseport_verifier_ops = { 11687 .get_func_proto = sk_reuseport_func_proto, 11688 .is_valid_access = sk_reuseport_is_valid_access, 11689 .convert_ctx_access = sk_reuseport_convert_ctx_access, 11690 }; 11691 11692 const struct bpf_prog_ops sk_reuseport_prog_ops = { 11693 }; 11694 11695 DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled); 11696 EXPORT_SYMBOL(bpf_sk_lookup_enabled); 11697 11698 BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx, 11699 struct sock *, sk, u64, flags) 11700 { 11701 if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE | 11702 BPF_SK_LOOKUP_F_NO_REUSEPORT))) 11703 return -EINVAL; 11704 if (unlikely(sk && sk_is_refcounted(sk))) 11705 return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */ 11706 if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN)) 11707 return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */ 11708 if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE)) 11709 return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */ 11710 11711 /* Check if socket is suitable for packet L3/L4 protocol */ 11712 if (sk && sk->sk_protocol != ctx->protocol) 11713 return -EPROTOTYPE; 11714 if (sk && sk->sk_family != ctx->family && 11715 (sk->sk_family == AF_INET || ipv6_only_sock(sk))) 11716 return -EAFNOSUPPORT; 11717 11718 if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE)) 11719 return -EEXIST; 11720 11721 /* Select socket as lookup result */ 11722 ctx->selected_sk = sk; 11723 ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT; 11724 return 0; 11725 } 11726 11727 static const struct bpf_func_proto bpf_sk_lookup_assign_proto = { 11728 .func = bpf_sk_lookup_assign, 11729 .gpl_only = false, 11730 .ret_type = RET_INTEGER, 11731 .arg1_type = ARG_PTR_TO_CTX, 11732 .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL, 11733 .arg3_type = ARG_ANYTHING, 11734 }; 11735 11736 static const struct bpf_func_proto * 11737 sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 11738 { 11739 switch 
(func_id) { 11740 case BPF_FUNC_perf_event_output: 11741 return &bpf_event_output_data_proto; 11742 case BPF_FUNC_sk_assign: 11743 return &bpf_sk_lookup_assign_proto; 11744 case BPF_FUNC_sk_release: 11745 return &bpf_sk_release_proto; 11746 default: 11747 return bpf_sk_base_func_proto(func_id, prog); 11748 } 11749 } 11750 11751 static bool sk_lookup_is_valid_access(int off, int size, 11752 enum bpf_access_type type, 11753 const struct bpf_prog *prog, 11754 struct bpf_insn_access_aux *info) 11755 { 11756 if (off < 0 || off >= sizeof(struct bpf_sk_lookup)) 11757 return false; 11758 if (off % size != 0) 11759 return false; 11760 if (type != BPF_READ) 11761 return false; 11762 11763 switch (off) { 11764 case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk): 11765 info->reg_type = PTR_TO_SOCKET_OR_NULL; 11766 return size == sizeof(__u64); 11767 11768 case bpf_ctx_range(struct bpf_sk_lookup, family): 11769 case bpf_ctx_range(struct bpf_sk_lookup, protocol): 11770 case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4): 11771 case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): 11772 case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): 11773 case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): 11774 case bpf_ctx_range(struct bpf_sk_lookup, local_port): 11775 case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex): 11776 bpf_ctx_record_field_size(info, sizeof(__u32)); 11777 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); 11778 11779 case bpf_ctx_range(struct bpf_sk_lookup, remote_port): 11780 /* Allow 4-byte access to 2-byte field for backward compatibility */ 11781 if (size == sizeof(__u32)) 11782 return true; 11783 bpf_ctx_record_field_size(info, sizeof(__be16)); 11784 return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16)); 11785 11786 case offsetofend(struct bpf_sk_lookup, remote_port) ... 
offsetof(struct bpf_sk_lookup, local_ip4) - 1:
		/* Allow access to zero padding for backward compatibility */
		bpf_ctx_record_field_size(info, sizeof(__u16));
		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));

	default:
		return false;
	}
}

/* Rewrite a struct bpf_sk_lookup access into loads from
 * struct bpf_sk_lookup_kern.
 *
 * @type:        access type; not consulted here, every case emits a load
 * @si:          the original instruction (offset, source and destination)
 * @insn_buf:    output buffer for the replacement instructions
 * @prog:        program being rewritten; unused
 * @target_size: receives the size of the kernel-side field where one is
 *               reported through bpf_target_off() or set directly
 *
 * "remote" fields are served from the source side of the kernel context
 * (v4.saddr, v6.saddr, sport) and "local" fields from the destination side
 * (v4.daddr, v6.daddr, dport).
 *
 * Returns the number of instructions written to @insn_buf; an offset that
 * matches no case writes nothing and returns 0.
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
					const struct bpf_insn *si,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog,
					u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sk_lookup, sk):
		/* The user-visible sk is the currently selected socket. */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, selected_sk));
		break;

	case offsetof(struct bpf_sk_lookup, family):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     family, 2, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, protocol):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     protocol, 2, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, remote_ip4):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     v4.saddr, 4, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, local_ip4):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     v4.daddr, 4, target_size));
		break;

	case bpf_ctx_range_till(struct bpf_sk_lookup,
				remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
		int off = si->off;

		/* Offset of the requested word inside struct in6_addr. */
		off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
		/* v6.saddr is a pointer: load it, and skip the dereference
		 * when it is NULL so that the result stays 0.
		 */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
		/* Without IPv6 support the field reads as zero. */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	}
	case bpf_ctx_range_till(struct bpf_sk_lookup,
				local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
		int off = si->off;

		/* Offset of the requested word inside struct in6_addr. */
		off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
		off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
		/* v6.daddr is a pointer: load it, and skip the dereference
		 * when it is NULL so that the result stays 0.
		 */
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
		/* Without IPv6 support the field reads as zero. */
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	}
	case offsetof(struct bpf_sk_lookup, remote_port):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     sport, 2, target_size));
		break;

	case offsetofend(struct bpf_sk_lookup, remote_port):
		/* The two bytes following remote_port read as zero. */
		*target_size = 2;
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
		break;

	case offsetof(struct bpf_sk_lookup, local_port):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     dport, 2, target_size));
		break;

	case offsetof(struct bpf_sk_lookup, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct bpf_sk_lookup_kern,
						     ingress_ifindex, 4, target_size));
		break;
	}

	return insn - insn_buf;
}

const struct bpf_prog_ops sk_lookup_prog_ops = {
	.test_run = bpf_prog_test_run_sk_lookup,
};

const struct bpf_verifier_ops sk_lookup_verifier_ops = {
	.get_func_proto		= sk_lookup_func_proto,
.is_valid_access = sk_lookup_is_valid_access, 11901 .convert_ctx_access = sk_lookup_convert_ctx_access, 11902 }; 11903 11904 #endif /* CONFIG_INET */ 11905 11906 DEFINE_BPF_DISPATCHER(xdp) 11907 11908 void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) 11909 { 11910 bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); 11911 } 11912 11913 BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) 11914 #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) 11915 BTF_SOCK_TYPE_xxx 11916 #undef BTF_SOCK_TYPE 11917 11918 BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) 11919 { 11920 /* tcp6_sock type is not generated in dwarf and hence btf, 11921 * trigger an explicit type generation here. 11922 */ 11923 BTF_TYPE_EMIT(struct tcp6_sock); 11924 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && 11925 sk->sk_family == AF_INET6) 11926 return (unsigned long)sk; 11927 11928 return (unsigned long)NULL; 11929 } 11930 11931 const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { 11932 .func = bpf_skc_to_tcp6_sock, 11933 .gpl_only = false, 11934 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 11935 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 11936 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6], 11937 }; 11938 11939 BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) 11940 { 11941 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) 11942 return (unsigned long)sk; 11943 11944 return (unsigned long)NULL; 11945 } 11946 11947 const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = { 11948 .func = bpf_skc_to_tcp_sock, 11949 .gpl_only = false, 11950 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 11951 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 11952 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP], 11953 }; 11954 11955 BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk) 11956 { 11957 /* BTF types for tcp_timewait_sock and inet_timewait_sock are not 11958 * generated if CONFIG_INET=n. Trigger an explicit generation here. 
11959 */ 11960 BTF_TYPE_EMIT(struct inet_timewait_sock); 11961 BTF_TYPE_EMIT(struct tcp_timewait_sock); 11962 11963 #ifdef CONFIG_INET 11964 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT) 11965 return (unsigned long)sk; 11966 #endif 11967 11968 #if IS_BUILTIN(CONFIG_IPV6) 11969 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT) 11970 return (unsigned long)sk; 11971 #endif 11972 11973 return (unsigned long)NULL; 11974 } 11975 11976 const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = { 11977 .func = bpf_skc_to_tcp_timewait_sock, 11978 .gpl_only = false, 11979 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 11980 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 11981 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW], 11982 }; 11983 11984 BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk) 11985 { 11986 #ifdef CONFIG_INET 11987 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV) 11988 return (unsigned long)sk; 11989 #endif 11990 11991 #if IS_BUILTIN(CONFIG_IPV6) 11992 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV) 11993 return (unsigned long)sk; 11994 #endif 11995 11996 return (unsigned long)NULL; 11997 } 11998 11999 const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = { 12000 .func = bpf_skc_to_tcp_request_sock, 12001 .gpl_only = false, 12002 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 12003 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 12004 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ], 12005 }; 12006 12007 BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk) 12008 { 12009 /* udp6_sock type is not generated in dwarf and hence btf, 12010 * trigger an explicit type generation here. 
12011 */ 12012 BTF_TYPE_EMIT(struct udp6_sock); 12013 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP && 12014 sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6) 12015 return (unsigned long)sk; 12016 12017 return (unsigned long)NULL; 12018 } 12019 12020 const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { 12021 .func = bpf_skc_to_udp6_sock, 12022 .gpl_only = false, 12023 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 12024 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 12025 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], 12026 }; 12027 12028 BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) 12029 { 12030 /* unix_sock type is not generated in dwarf and hence btf, 12031 * trigger an explicit type generation here. 12032 */ 12033 BTF_TYPE_EMIT(struct unix_sock); 12034 if (sk && sk_is_unix(sk)) 12035 return (unsigned long)sk; 12036 12037 return (unsigned long)NULL; 12038 } 12039 12040 const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { 12041 .func = bpf_skc_to_unix_sock, 12042 .gpl_only = false, 12043 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 12044 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, 12045 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], 12046 }; 12047 12048 BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk) 12049 { 12050 BTF_TYPE_EMIT(struct mptcp_sock); 12051 return (unsigned long)bpf_mptcp_sock_from_subflow(sk); 12052 } 12053 12054 const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = { 12055 .func = bpf_skc_to_mptcp_sock, 12056 .gpl_only = false, 12057 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 12058 .arg1_type = ARG_PTR_TO_SOCK_COMMON, 12059 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP], 12060 }; 12061 12062 BPF_CALL_1(bpf_sock_from_file, struct file *, file) 12063 { 12064 return (unsigned long)sock_from_file(file); 12065 } 12066 12067 BTF_ID_LIST(bpf_sock_from_file_btf_ids) 12068 BTF_ID(struct, socket) 12069 BTF_ID(struct, file) 12070 12071 const struct bpf_func_proto bpf_sock_from_file_proto = { 12072 .func = bpf_sock_from_file, 
12073 .gpl_only = false, 12074 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, 12075 .ret_btf_id = &bpf_sock_from_file_btf_ids[0], 12076 .arg1_type = ARG_PTR_TO_BTF_ID, 12077 .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], 12078 }; 12079 12080 static const struct bpf_func_proto * 12081 bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 12082 { 12083 const struct bpf_func_proto *func; 12084 12085 switch (func_id) { 12086 case BPF_FUNC_skc_to_tcp6_sock: 12087 func = &bpf_skc_to_tcp6_sock_proto; 12088 break; 12089 case BPF_FUNC_skc_to_tcp_sock: 12090 func = &bpf_skc_to_tcp_sock_proto; 12091 break; 12092 case BPF_FUNC_skc_to_tcp_timewait_sock: 12093 func = &bpf_skc_to_tcp_timewait_sock_proto; 12094 break; 12095 case BPF_FUNC_skc_to_tcp_request_sock: 12096 func = &bpf_skc_to_tcp_request_sock_proto; 12097 break; 12098 case BPF_FUNC_skc_to_udp6_sock: 12099 func = &bpf_skc_to_udp6_sock_proto; 12100 break; 12101 case BPF_FUNC_skc_to_unix_sock: 12102 func = &bpf_skc_to_unix_sock_proto; 12103 break; 12104 case BPF_FUNC_skc_to_mptcp_sock: 12105 func = &bpf_skc_to_mptcp_sock_proto; 12106 break; 12107 case BPF_FUNC_ktime_get_coarse_ns: 12108 return &bpf_ktime_get_coarse_ns_proto; 12109 default: 12110 return bpf_base_func_proto(func_id, prog); 12111 } 12112 12113 if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) 12114 return NULL; 12115 12116 return func; 12117 } 12118 12119 /** 12120 * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. 
12121 * @skb: socket buffer carrying the metadata 12122 * @offset: offset into the metadata area, must be <= skb_metadata_len() 12123 */ 12124 void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) 12125 { 12126 return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; 12127 } 12128 12129 int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, 12130 const void *from, u32 len, u64 flags) 12131 { 12132 if (unlikely(flags)) 12133 return -EINVAL; 12134 if (unlikely(bpf_try_make_writable(skb, 0))) 12135 return -EFAULT; 12136 12137 memmove(bpf_skb_meta_pointer(skb, offset), from, len); 12138 return 0; 12139 } 12140 12141 __bpf_kfunc_start_defs(); 12142 __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, 12143 struct bpf_dynptr *ptr__uninit) 12144 { 12145 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; 12146 struct sk_buff *skb = (struct sk_buff *)s; 12147 12148 if (flags) { 12149 bpf_dynptr_set_null(ptr); 12150 return -EINVAL; 12151 } 12152 12153 bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len); 12154 12155 return 0; 12156 } 12157 12158 /** 12159 * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. 12160 * @skb_: socket buffer carrying the metadata 12161 * @flags: future use, must be zero 12162 * @ptr__uninit: dynptr to initialize 12163 * 12164 * Set up a dynptr for access to the metadata area earlier allocated from the 12165 * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to 12166 * &__sk_buff->data_meta. 
12167 * 12168 * Return: 12169 * * %0 - dynptr ready to use 12170 * * %-EINVAL - invalid flags, dynptr set to null 12171 */ 12172 __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, 12173 struct bpf_dynptr *ptr__uninit) 12174 { 12175 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; 12176 struct sk_buff *skb = (struct sk_buff *)skb_; 12177 12178 if (flags) { 12179 bpf_dynptr_set_null(ptr); 12180 return -EINVAL; 12181 } 12182 12183 bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); 12184 12185 return 0; 12186 } 12187 12188 __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, 12189 struct bpf_dynptr *ptr__uninit) 12190 { 12191 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; 12192 struct xdp_buff *xdp = (struct xdp_buff *)x; 12193 12194 if (flags) { 12195 bpf_dynptr_set_null(ptr); 12196 return -EINVAL; 12197 } 12198 12199 bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp)); 12200 12201 return 0; 12202 } 12203 12204 __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, 12205 const u8 *sun_path, u32 sun_path__sz) 12206 { 12207 struct sockaddr_un *un; 12208 12209 if (sa_kern->sk->sk_family != AF_UNIX) 12210 return -EINVAL; 12211 12212 /* We do not allow changing the address to unnamed or larger than the 12213 * maximum allowed address size for a unix sockaddr. 
12214 */ 12215 if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) 12216 return -EINVAL; 12217 12218 un = (struct sockaddr_un *)sa_kern->uaddr; 12219 memcpy(un->sun_path, sun_path, sun_path__sz); 12220 sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; 12221 12222 return 0; 12223 } 12224 12225 __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, 12226 struct bpf_tcp_req_attrs *attrs, int attrs__sz) 12227 { 12228 #if IS_ENABLED(CONFIG_SYN_COOKIES) 12229 struct sk_buff *skb = (struct sk_buff *)s; 12230 const struct request_sock_ops *ops; 12231 struct inet_request_sock *ireq; 12232 struct tcp_request_sock *treq; 12233 struct request_sock *req; 12234 struct net *net; 12235 __u16 min_mss; 12236 u32 tsoff = 0; 12237 12238 if (attrs__sz != sizeof(*attrs) || 12239 attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) 12240 return -EINVAL; 12241 12242 if (!skb_at_tc_ingress(skb)) 12243 return -EINVAL; 12244 12245 net = dev_net(skb->dev); 12246 if (net != sock_net(sk)) 12247 return -ENETUNREACH; 12248 12249 switch (skb->protocol) { 12250 case htons(ETH_P_IP): 12251 ops = &tcp_request_sock_ops; 12252 min_mss = 536; 12253 break; 12254 #if IS_BUILTIN(CONFIG_IPV6) 12255 case htons(ETH_P_IPV6): 12256 ops = &tcp6_request_sock_ops; 12257 min_mss = IPV6_MIN_MTU - 60; 12258 break; 12259 #endif 12260 default: 12261 return -EINVAL; 12262 } 12263 12264 if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || 12265 sk_is_mptcp(sk)) 12266 return -EINVAL; 12267 12268 if (attrs->mss < min_mss) 12269 return -EINVAL; 12270 12271 if (attrs->wscale_ok) { 12272 if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) 12273 return -EINVAL; 12274 12275 if (attrs->snd_wscale > TCP_MAX_WSCALE || 12276 attrs->rcv_wscale > TCP_MAX_WSCALE) 12277 return -EINVAL; 12278 } 12279 12280 if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) 12281 return -EINVAL; 12282 12283 if (attrs->tstamp_ok) { 12284 if 
(!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) 12285 return -EINVAL; 12286 12287 tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); 12288 } 12289 12290 req = inet_reqsk_alloc(ops, sk, false); 12291 if (!req) 12292 return -ENOMEM; 12293 12294 ireq = inet_rsk(req); 12295 treq = tcp_rsk(req); 12296 12297 req->rsk_listener = sk; 12298 req->syncookie = 1; 12299 req->mss = attrs->mss; 12300 req->ts_recent = attrs->rcv_tsval; 12301 12302 ireq->snd_wscale = attrs->snd_wscale; 12303 ireq->rcv_wscale = attrs->rcv_wscale; 12304 ireq->tstamp_ok = !!attrs->tstamp_ok; 12305 ireq->sack_ok = !!attrs->sack_ok; 12306 ireq->wscale_ok = !!attrs->wscale_ok; 12307 ireq->ecn_ok = !!attrs->ecn_ok; 12308 12309 treq->req_usec_ts = !!attrs->usec_ts_ok; 12310 treq->ts_off = tsoff; 12311 12312 skb_orphan(skb); 12313 skb->sk = req_to_sk(req); 12314 skb->destructor = sock_pfree; 12315 12316 return 0; 12317 #else 12318 return -EOPNOTSUPP; 12319 #endif 12320 } 12321 12322 __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, 12323 u64 flags) 12324 { 12325 struct sk_buff *skb; 12326 12327 if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) 12328 return -EOPNOTSUPP; 12329 12330 if (flags) 12331 return -EINVAL; 12332 12333 skb = skops->skb; 12334 skb_shinfo(skb)->tx_flags |= SKBTX_BPF; 12335 TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; 12336 skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; 12337 12338 return 0; 12339 } 12340 12341 /** 12342 * bpf_xdp_pull_data() - Pull in non-linear xdp data. 12343 * @x: &xdp_md associated with the XDP buffer 12344 * @len: length of data to be made directly accessible in the linear part 12345 * 12346 * Pull in data in case the XDP buffer associated with @x is non-linear and 12347 * not all @len are in the linear data area. 12348 * 12349 * Direct packet access allows reading and writing linear XDP data through 12350 * packet pointers (i.e., &xdp_md->data + offsets). 
The amount of data which 12351 * ends up in the linear part of the xdp_buff depends on the NIC and its 12352 * configuration. When a frag-capable XDP program wants to directly access 12353 * headers that may be in the non-linear area, call this kfunc to make sure 12354 * the data is available in the linear area. Alternatively, use dynptr or 12355 * bpf_xdp_{load,store}_bytes() to access data without pulling. 12356 * 12357 * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate 12358 * headers in the non-linear data area. 12359 * 12360 * A call to this kfunc may reduce headroom. If there is not enough tailroom 12361 * in the linear data area, metadata and data will be shifted down. 12362 * 12363 * A call to this kfunc is susceptible to change the buffer geometry. 12364 * Therefore, at load time, all checks on pointers previously done by the 12365 * verifier are invalidated and must be performed again, if the kfunc is used 12366 * in combination with direct packet access. 12367 * 12368 * Return: 12369 * * %0 - success 12370 * * %-EINVAL - invalid len 12371 */ 12372 __bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) 12373 { 12374 struct xdp_buff *xdp = (struct xdp_buff *)x; 12375 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); 12376 int i, delta, shift, headroom, tailroom, n_frags_free = 0; 12377 void *data_hard_end = xdp_data_hard_end(xdp); 12378 int data_len = xdp->data_end - xdp->data; 12379 void *start; 12380 12381 if (len <= data_len) 12382 return 0; 12383 12384 if (unlikely(len > xdp_get_buff_len(xdp))) 12385 return -EINVAL; 12386 12387 start = xdp_data_meta_unsupported(xdp) ? 
xdp->data : xdp->data_meta; 12388 12389 headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); 12390 tailroom = data_hard_end - xdp->data_end; 12391 12392 delta = len - data_len; 12393 if (unlikely(delta > tailroom + headroom)) 12394 return -EINVAL; 12395 12396 shift = delta - tailroom; 12397 if (shift > 0) { 12398 memmove(start - shift, start, xdp->data_end - start); 12399 12400 xdp->data_meta -= shift; 12401 xdp->data -= shift; 12402 xdp->data_end -= shift; 12403 } 12404 12405 for (i = 0; i < sinfo->nr_frags && delta; i++) { 12406 skb_frag_t *frag = &sinfo->frags[i]; 12407 u32 shrink = min_t(u32, delta, skb_frag_size(frag)); 12408 12409 memcpy(xdp->data_end, skb_frag_address(frag), shrink); 12410 12411 xdp->data_end += shrink; 12412 sinfo->xdp_frags_size -= shrink; 12413 delta -= shrink; 12414 if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) 12415 n_frags_free++; 12416 } 12417 12418 if (unlikely(n_frags_free)) { 12419 memmove(sinfo->frags, sinfo->frags + n_frags_free, 12420 (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); 12421 12422 sinfo->nr_frags -= n_frags_free; 12423 12424 if (!sinfo->nr_frags) { 12425 xdp_buff_clear_frags_flag(xdp); 12426 xdp_buff_clear_frag_pfmemalloc(xdp); 12427 } 12428 } 12429 12430 return 0; 12431 } 12432 12433 __bpf_kfunc_end_defs(); 12434 12435 int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, 12436 struct bpf_dynptr *ptr__uninit) 12437 { 12438 struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; 12439 int err; 12440 12441 err = bpf_dynptr_from_skb(skb, flags, ptr__uninit); 12442 if (err) 12443 return err; 12444 12445 bpf_dynptr_set_rdonly(ptr); 12446 12447 return 0; 12448 } 12449 12450 BTF_KFUNCS_START(bpf_kfunc_check_set_skb) 12451 BTF_ID_FLAGS(func, bpf_dynptr_from_skb) 12452 BTF_KFUNCS_END(bpf_kfunc_check_set_skb) 12453 12454 BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) 12455 BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta) 12456 BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) 12457 
12458 BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) 12459 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) 12460 BTF_ID_FLAGS(func, bpf_xdp_pull_data) 12461 BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) 12462 12463 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) 12464 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) 12465 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr) 12466 12467 BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) 12468 BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk) 12469 BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) 12470 12471 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) 12472 BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp) 12473 BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) 12474 12475 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { 12476 .owner = THIS_MODULE, 12477 .set = &bpf_kfunc_check_set_skb, 12478 }; 12479 12480 static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { 12481 .owner = THIS_MODULE, 12482 .set = &bpf_kfunc_check_set_skb_meta, 12483 }; 12484 12485 static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { 12486 .owner = THIS_MODULE, 12487 .set = &bpf_kfunc_check_set_xdp, 12488 }; 12489 12490 static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { 12491 .owner = THIS_MODULE, 12492 .set = &bpf_kfunc_check_set_sock_addr, 12493 }; 12494 12495 static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { 12496 .owner = THIS_MODULE, 12497 .set = &bpf_kfunc_check_set_tcp_reqsk, 12498 }; 12499 12500 static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { 12501 .owner = THIS_MODULE, 12502 .set = &bpf_kfunc_check_set_sock_ops, 12503 }; 12504 12505 static int __init bpf_kfunc_init(void) 12506 { 12507 int ret; 12508 12509 ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb); 12510 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb); 12511 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb); 12512 ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb); 12513 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb); 12514 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb); 12515 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb); 12516 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); 12517 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); 12518 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); 12519 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); 12520 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); 12521 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); 12522 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); 12523 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, 12524 &bpf_kfunc_set_sock_addr); 12525 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); 12526 return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); 12527 } 12528 late_initcall(bpf_kfunc_init); 12529 12530 __bpf_kfunc_start_defs(); 12531 12532 /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code. 12533 * 12534 * The function expects a non-NULL pointer to a socket, and invokes the 12535 * protocol specific socket destroy handlers. 12536 * 12537 * The helper can only be called from BPF contexts that have acquired the socket 12538 * locks. 12539 * 12540 * Parameters: 12541 * @sock: Pointer to socket to be destroyed 12542 * 12543 * Return: 12544 * On error, may return EPROTONOSUPPORT, EINVAL. 12545 * EPROTONOSUPPORT if protocol specific destroy handler is not supported. 
12546 * 0 otherwise 12547 */ 12548 __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock) 12549 { 12550 struct sock *sk = (struct sock *)sock; 12551 12552 /* The locking semantics that allow for synchronous execution of the 12553 * destroy handlers are only supported for TCP and UDP. 12554 * Supporting protocols will need to acquire sock lock in the BPF context 12555 * prior to invoking this kfunc. 12556 */ 12557 if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP && 12558 sk->sk_protocol != IPPROTO_UDP)) 12559 return -EOPNOTSUPP; 12560 12561 return sk->sk_prot->diag_destroy(sk, ECONNABORTED); 12562 } 12563 12564 __bpf_kfunc_end_defs(); 12565 12566 BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids) 12567 BTF_ID_FLAGS(func, bpf_sock_destroy) 12568 BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids) 12569 12570 static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id) 12571 { 12572 if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) && 12573 prog->expected_attach_type != BPF_TRACE_ITER) 12574 return -EACCES; 12575 return 0; 12576 } 12577 12578 static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = { 12579 .owner = THIS_MODULE, 12580 .set = &bpf_sk_iter_kfunc_ids, 12581 .filter = tracing_iter_filter, 12582 }; 12583 12584 static int init_subsystem(void) 12585 { 12586 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set); 12587 } 12588 late_initcall(init_subsystem); 12589