// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h>
#include <net/inet_dscp.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>

struct nft_flow_offload {
	struct nft_flowtable *flowtable;
};

static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
{
	if (dst_xfrm(dst))
		return FLOW_OFFLOAD_XMIT_XFRM;

	return FLOW_OFFLOAD_XMIT_NEIGH;
}

/* Default forwarding path: keep the cached dst and transmit through the
 * neighbour (or xfrm) output path.
 */
static void nft_default_forward_path(struct nf_flow_route *route,
				     struct dst_entry *dst_cache,
				     enum ip_conntrack_dir dir)
{
	route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
	route->tuple[dir].dst = dst_cache;
	route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
}

/* Only plain Ethernet devices with a valid unicast address are eligible
 * for the direct xmit path.
 */
static bool nft_is_valid_ether_device(const struct net_device *dev)
{
	if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
	    dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
		return false;

	return true;
}

/* Resolve the next-hop hardware address via the neighbour cache, then let
 * the stacked devices (VLAN, PPPoE, bridge, DSA) describe the real
 * forwarding path.
 */
static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
				     const struct dst_entry *dst_cache,
				     const struct nf_conn *ct,
				     enum ip_conntrack_dir dir, u8 *ha,
				     struct net_device_path_stack *stack)
{
	const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
	struct net_device *dev = dst_cache->dev;
	struct neighbour *n;
	u8 nud_state;

	if (!nft_is_valid_ether_device(dev))
		goto out;

	n = dst_neigh_lookup(dst_cache, daddr);
	if (!n)
		return -1;

	read_lock_bh(&n->lock);
	nud_state = n->nud_state;
	ether_addr_copy(ha, n->ha);
	read_unlock_bh(&n->lock);
	neigh_release(n);

	if (!(nud_state & NUD_VALID))
		return -1;

out:
	return dev_fill_forward_path(dev, ha, stack);
}

struct nft_forward_info {
	const struct net_device *indev;
	const struct net_device *outdev;
	const struct net_device *hw_outdev;
	struct id {
		__u16 id;
		__be16 proto;
	} encap[NF_FLOW_TABLE_ENCAP_MAX];
	u8 num_encaps;
	u8 ingress_vlans;
	u8 h_source[ETH_ALEN];
	u8 h_dest[ETH_ALEN];
	enum flow_offload_xmit_type xmit_type;
};

static void nft_dev_path_info(const struct net_device_path_stack *stack,
			      struct nft_forward_info *info,
			      unsigned char *ha, struct nf_flowtable *flowtable)
{
	const struct net_device_path *path;
	int i;

	memcpy(info->h_dest, ha, ETH_ALEN);

	for (i = 0; i < stack->num_paths; i++) {
		path = &stack->path[i];
		switch (path->type) {
		case DEV_PATH_ETHERNET:
		case DEV_PATH_DSA:
		case DEV_PATH_VLAN:
		case DEV_PATH_PPPOE:
			info->indev = path->dev;
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			if (path->type == DEV_PATH_ETHERNET)
				break;
			if (path->type == DEV_PATH_DSA) {
				i = stack->num_paths; /* stop walking the path stack */
				break;
			}

			/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
			if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
				info->indev = NULL;
				break;
			}
			if (!info->outdev)
				info->outdev = path->dev;
			info->encap[info->num_encaps].id = path->encap.id;
			info->encap[info->num_encaps].proto = path->encap.proto;
			info->num_encaps++;
			if (path->type == DEV_PATH_PPPOE)
				memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
			break;
		case DEV_PATH_BRIDGE:
			if (is_zero_ether_addr(info->h_source))
				memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

			switch (path->bridge.vlan_mode) {
			case DEV_PATH_BR_VLAN_UNTAG_HW:
				info->ingress_vlans |= BIT(info->num_encaps - 1);
				break;
			case DEV_PATH_BR_VLAN_TAG:
				info->encap[info->num_encaps].id = path->bridge.vlan_id;
				info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
				info->num_encaps++;
				break;
			case DEV_PATH_BR_VLAN_UNTAG:
				info->num_encaps--;
				break;
			case DEV_PATH_BR_VLAN_KEEP:
				break;
			}
			info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
			break;
		default:
			info->indev = NULL;
			break;
		}
	}
	if (!info->outdev)
		info->outdev = info->indev;

	info->hw_outdev = info->indev;

	if (nf_flowtable_hw_offload(flowtable) &&
	    nft_is_valid_ether_device(info->indev))
		info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
}

/* Check whether the device is one of the flowtable's hook devices. */
static bool nft_flowtable_find_dev(const struct net_device *dev,
				   struct nft_flowtable *ft)
{
	struct nft_hook *hook;
	bool found = false;

	list_for_each_entry_rcu(hook, &ft->hook_list, list) {
		if (!nft_hook_find_ops_rcu(hook, dev))
			continue;

		found = true;
		break;
	}

	return found;
}

static void nft_dev_forward_path(struct nf_flow_route *route,
				 const struct nf_conn *ct,
				 enum ip_conntrack_dir dir,
				 struct nft_flowtable *ft)
{
	const struct dst_entry *dst = route->tuple[dir].dst;
	struct net_device_path_stack stack;
	struct nft_forward_info info = {};
	unsigned char ha[ETH_ALEN];
	int i;

	if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
		nft_dev_path_info(&stack, &info, ha, &ft->data);

	if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
		return;

	route->tuple[!dir].in.ifindex = info.indev->ifindex;
	for (i = 0; i < info.num_encaps; i++) {
		route->tuple[!dir].in.encap[i].id = info.encap[i].id;
		route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
	}
	route->tuple[!dir].in.num_encaps = info.num_encaps;
	route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;

	if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
		memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
		memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
		route->tuple[dir].out.ifindex = info.outdev->ifindex;
		route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
		route->tuple[dir].xmit_type = info.xmit_type;
	}
}

static int nft_flow_route(const struct nft_pktinfo *pkt,
			  const struct nf_conn *ct,
			  struct nf_flow_route *route,
			  enum ip_conntrack_dir dir,
			  struct nft_flowtable *ft)
{
	struct dst_entry *this_dst = skb_dst(pkt->skb);
	struct dst_entry *other_dst = NULL;
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	switch (nft_pf(pkt)) {
	case NFPROTO_IPV4:
		fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
		fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
		fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
		fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
		fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
		fl.u.ip4.flowi4_mark = pkt->skb->mark;
		fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		break;
	case NFPROTO_IPV6:
		fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
		fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
		fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
		fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
		fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
		fl.u.ip6.flowi6_mark = pkt->skb->mark;
		fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
		break;
	}

	if (!dst_hold_safe(this_dst))
		return -ENOENT;

	nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
	if (!other_dst) {
		dst_release(this_dst);
		return -ENOENT;
	}

	nft_default_forward_path(route, this_dst, dir);
	nft_default_forward_path(route, other_dst, !dir);

	/* Only refine via the device forward path when both directions use
	 * plain neighbour output.
	 */
	if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
	    route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
		nft_dev_forward_path(route, ct, dir, ft);
		nft_dev_forward_path(route, ct, !dir, ft);
	}

	return 0;
}

/* Packets with an IPsec secpath or IPv4 options cannot take the fastpath. */
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
	if (skb_sec_path(skb))
		return true;

	if (family == NFPROTO_IPV4) {
		const struct ip_options *opt;

		opt = &(IPCB(skb)->opt);

		if (unlikely(opt->optlen))
			return true;
	}

	return false;
}

static void flow_offload_ct_tcp(struct nf_conn *ct)
{
	/* conntrack will not see all packets, disable tcp window validation. */
	spin_lock_bh(&ct->lock);
	ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	spin_unlock_bh(&ct->lock);
}

static void nft_flow_offload_eval(const struct nft_expr *expr,
				  struct nft_regs *regs,
				  const struct nft_pktinfo *pkt)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	struct nf_flowtable *flowtable = &priv->flowtable->data;
	struct tcphdr _tcph, *tcph = NULL;
	struct nf_flow_route route = {};
	enum ip_conntrack_info ctinfo;
	struct flow_offload *flow;
	enum ip_conntrack_dir dir;
	struct nf_conn *ct;
	int ret;

	if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
		goto out;

	ct = nf_ct_get(pkt->skb, &ctinfo);
	if (!ct)
		goto out;

	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
	case IPPROTO_TCP:
		tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
					  sizeof(_tcph), &_tcph);
		if (unlikely(!tcph || tcph->fin || tcph->rst ||
			     !nf_conntrack_tcp_established(ct)))
			goto out;
		break;
	case IPPROTO_UDP:
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE: {
		struct nf_conntrack_tuple *tuple;

		if (ct->status & IPS_NAT_MASK)
			goto out;
		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		/* No support for GRE v1 */
		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
			goto out;
		break;
	}
#endif
	default:
		goto out;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
		goto out;

	if (!nf_ct_is_confirmed(ct))
		goto out;

	/* Claim this conntrack entry for offloading; bail out if another
	 * packet already did.
	 */
	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
		goto err_flow_route;

	flow = flow_offload_alloc(ct);
	if (!flow)
		goto err_flow_alloc;

	flow_offload_route_init(flow, &route);
	if (tcph)
		flow_offload_ct_tcp(ct);

	__set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
	ret = flow_offload_add(flowtable, flow);
	if (ret < 0)
		goto err_flow_add;

	return;

err_flow_add:
	flow_offload_free(flow);
err_flow_alloc:
	dst_release(route.tuple[dir].dst);
	dst_release(route.tuple[!dir].dst);
err_flow_route:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
out:
	regs->verdict.code = NFT_BREAK;
}

static int nft_flow_offload_validate(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	unsigned int hook_mask = (1 << NF_INET_FORWARD);

	if (ctx->family != NFPROTO_IPV4 &&
	    ctx->family != NFPROTO_IPV6 &&
	    ctx->family != NFPROTO_INET)
		return -EOPNOTSUPP;

	return nft_chain_validate_hooks(ctx->chain, hook_mask);
}

static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
	[NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING,
				   .len = NFT_NAME_MAXLEN - 1 },
};

static int nft_flow_offload_init(const struct nft_ctx *ctx,
				 const struct nft_expr *expr,
				 const struct nlattr * const tb[])
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);
	u8 genmask = nft_genmask_next(ctx->net);
	struct nft_flowtable *flowtable;

	if (!tb[NFTA_FLOW_TABLE_NAME])
		return -EINVAL;

	flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
					 tb[NFTA_FLOW_TABLE_NAME], genmask);
	if (IS_ERR(flowtable))
		return PTR_ERR(flowtable);

	if (!nft_use_inc(&flowtable->use))
		return -EMFILE;

	priv->flowtable = flowtable;

	return nf_ct_netns_get(ctx->net, ctx->family);
}

static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
					const struct nft_expr *expr,
					enum nft_trans_phase phase)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase);
}

static void nft_flow_offload_activate(const struct nft_ctx *ctx,
				      const struct nft_expr *expr)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	nft_use_inc_restore(&priv->flowtable->use);
}

static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
				     const struct nft_expr *expr)
{
	nf_ct_netns_put(ctx->net, ctx->family);
}

static int nft_flow_offload_dump(struct sk_buff *skb,
				 const struct nft_expr *expr, bool reset)
{
	struct nft_flow_offload *priv = nft_expr_priv(expr);

	if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -1;
}

static struct nft_expr_type nft_flow_offload_type;
static const struct nft_expr_ops nft_flow_offload_ops = {
	.type		= &nft_flow_offload_type,
	.size		= NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
	.eval		= nft_flow_offload_eval,
	.init		= nft_flow_offload_init,
	.activate	= nft_flow_offload_activate,
	.deactivate	= nft_flow_offload_deactivate,
	.destroy	= nft_flow_offload_destroy,
	.validate	= nft_flow_offload_validate,
	.dump		= nft_flow_offload_dump,
	.reduce		= NFT_REDUCE_READONLY,
};

static struct nft_expr_type nft_flow_offload_type __read_mostly = {
	.name		= "flow_offload",
	.ops		= &nft_flow_offload_ops,
	.policy		= nft_flow_offload_policy,
	.maxattr	= NFTA_FLOW_MAX,
	.owner		= THIS_MODULE,
};

static int flow_offload_netdev_event(struct notifier_block *this,
				     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	nf_flow_table_cleanup(dev);

	return NOTIFY_DONE;
}

static struct notifier_block flow_offload_netdev_notifier = {
	.notifier_call	= flow_offload_netdev_event,
};

static int __init nft_flow_offload_module_init(void)
{
	int err;

	err = register_netdevice_notifier(&flow_offload_netdev_notifier);
	if (err)
		goto err;

	err = nft_register_expr(&nft_flow_offload_type);
	if (err < 0)
		goto register_expr;

	return 0;

register_expr:
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
err:
	return err;
}

static void __exit nft_flow_offload_module_exit(void)
{
	nft_unregister_expr(&nft_flow_offload_type);
	unregister_netdevice_notifier(&flow_offload_netdev_notifier);
}

module_init(nft_flow_offload_module_init);
module_exit(nft_flow_offload_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
MODULE_DESCRIPTION("nftables hardware flow offload module");
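
/*
 * Usage sketch (illustrative only, not part of the module): a minimal
 * nftables ruleset that exercises this expression. Table, chain, flowtable
 * and device names below are placeholders. The "flow add @f" statement in
 * the forward chain is what ends up invoking nft_flow_offload_eval() per
 * packet, moving established TCP/UDP flows onto the flowtable fastpath:
 *
 *	table inet filter {
 *		flowtable f {
 *			hook ingress priority 0
 *			devices = { eth0, eth1 }
 *		}
 *		chain forward {
 *			type filter hook forward priority 0; policy accept;
 *			meta l4proto { tcp, udp } flow add @f
 *		}
 *	}
 *
 * nft_flow_offload_validate() restricts this expression to forward hook
 * chains of the ipv4, ipv6 and inet families.
 */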