1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Connection state tracking for netfilter. This is separated from, 3 but required by, the NAT layer; it can also be used by an iptables 4 extension. */ 5 6 /* (C) 1999-2001 Paul `Rusty' Russell 7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net> 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <linux/types.h> 15 #include <linux/netfilter.h> 16 #include <linux/module.h> 17 #include <linux/sched.h> 18 #include <linux/skbuff.h> 19 #include <linux/proc_fs.h> 20 #include <linux/vmalloc.h> 21 #include <linux/stddef.h> 22 #include <linux/slab.h> 23 #include <linux/random.h> 24 #include <linux/siphash.h> 25 #include <linux/err.h> 26 #include <linux/percpu.h> 27 #include <linux/moduleparam.h> 28 #include <linux/notifier.h> 29 #include <linux/kernel.h> 30 #include <linux/netdevice.h> 31 #include <linux/socket.h> 32 #include <linux/mm.h> 33 #include <linux/nsproxy.h> 34 #include <linux/rculist_nulls.h> 35 36 #include <net/netfilter/nf_conntrack.h> 37 #include <net/netfilter/nf_conntrack_bpf.h> 38 #include <net/netfilter/nf_conntrack_l4proto.h> 39 #include <net/netfilter/nf_conntrack_expect.h> 40 #include <net/netfilter/nf_conntrack_helper.h> 41 #include <net/netfilter/nf_conntrack_core.h> 42 #include <net/netfilter/nf_conntrack_extend.h> 43 #include <net/netfilter/nf_conntrack_acct.h> 44 #include <net/netfilter/nf_conntrack_ecache.h> 45 #include <net/netfilter/nf_conntrack_zones.h> 46 #include <net/netfilter/nf_conntrack_timestamp.h> 47 #include <net/netfilter/nf_conntrack_timeout.h> 48 #include <net/netfilter/nf_conntrack_labels.h> 49 #include <net/netfilter/nf_conntrack_synproxy.h> 50 #include <net/netfilter/nf_nat.h> 51 #include <net/netfilter/nf_nat_helper.h> 52 #include <net/netns/hash.h> 53 #include <net/ip.h> 54 55 #include "nf_internals.h" 56 57 __cacheline_aligned_in_smp 
spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 58 EXPORT_SYMBOL_GPL(nf_conntrack_locks); 59 60 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 61 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 62 63 struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 64 EXPORT_SYMBOL_GPL(nf_conntrack_hash); 65 66 struct conntrack_gc_work { 67 struct delayed_work dwork; 68 u32 next_bucket; 69 u32 avg_timeout; 70 u32 count; 71 u32 start_time; 72 bool exiting; 73 bool early_drop; 74 }; 75 76 static __read_mostly struct kmem_cache *nf_conntrack_cachep; 77 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 78 static __read_mostly bool nf_conntrack_locks_all; 79 80 /* serialize hash resizes and nf_ct_iterate_cleanup */ 81 static DEFINE_MUTEX(nf_conntrack_mutex); 82 83 #define GC_SCAN_INTERVAL_MAX (60ul * HZ) 84 #define GC_SCAN_INTERVAL_MIN (1ul * HZ) 85 86 /* clamp timeouts to this value (TCP unacked) */ 87 #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) 88 89 /* Initial bias pretending we have 100 entries at the upper bound so we don't 90 * wakeup often just because we have three entries with a 1s timeout while still 91 * allowing non-idle machines to wakeup more often when needed. 
92 */ 93 #define GC_SCAN_INITIAL_COUNT 100 94 #define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX 95 96 #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) 97 #define GC_SCAN_EXPIRED_MAX (64000u / HZ) 98 99 #define MIN_CHAINLEN 50u 100 #define MAX_CHAINLEN (80u - MIN_CHAINLEN) 101 102 static struct conntrack_gc_work conntrack_gc_work; 103 104 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) 105 { 106 /* 1) Acquire the lock */ 107 spin_lock(lock); 108 109 /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics 110 * It pairs with the smp_store_release() in nf_conntrack_all_unlock() 111 */ 112 if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) 113 return; 114 115 /* fast path failed, unlock */ 116 spin_unlock(lock); 117 118 /* Slow path 1) get global lock */ 119 spin_lock(&nf_conntrack_locks_all_lock); 120 121 /* Slow path 2) get the lock we want */ 122 spin_lock(lock); 123 124 /* Slow path 3) release the global lock */ 125 spin_unlock(&nf_conntrack_locks_all_lock); 126 } 127 EXPORT_SYMBOL_GPL(nf_conntrack_lock); 128 129 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) 130 { 131 h1 %= CONNTRACK_LOCKS; 132 h2 %= CONNTRACK_LOCKS; 133 spin_unlock(&nf_conntrack_locks[h1]); 134 if (h1 != h2) 135 spin_unlock(&nf_conntrack_locks[h2]); 136 } 137 138 /* return true if we need to recompute hashes (in case hash table was resized) */ 139 static bool nf_conntrack_double_lock(unsigned int h1, unsigned int h2, 140 unsigned int sequence) 141 { 142 h1 %= CONNTRACK_LOCKS; 143 h2 %= CONNTRACK_LOCKS; 144 if (h1 <= h2) { 145 nf_conntrack_lock(&nf_conntrack_locks[h1]); 146 if (h1 != h2) 147 spin_lock_nested(&nf_conntrack_locks[h2], 148 SINGLE_DEPTH_NESTING); 149 } else { 150 nf_conntrack_lock(&nf_conntrack_locks[h2]); 151 spin_lock_nested(&nf_conntrack_locks[h1], 152 SINGLE_DEPTH_NESTING); 153 } 154 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { 155 nf_conntrack_double_unlock(h1, h2); 156 return true; 157 } 158 return false; 159 
} 160 161 static void nf_conntrack_all_lock(void) 162 __acquires(&nf_conntrack_locks_all_lock) 163 { 164 int i; 165 166 spin_lock(&nf_conntrack_locks_all_lock); 167 168 /* For nf_contrack_locks_all, only the latest time when another 169 * CPU will see an update is controlled, by the "release" of the 170 * spin_lock below. 171 * The earliest time is not controlled, an thus KCSAN could detect 172 * a race when nf_conntract_lock() reads the variable. 173 * WRITE_ONCE() is used to ensure the compiler will not 174 * optimize the write. 175 */ 176 WRITE_ONCE(nf_conntrack_locks_all, true); 177 178 for (i = 0; i < CONNTRACK_LOCKS; i++) { 179 spin_lock(&nf_conntrack_locks[i]); 180 181 /* This spin_unlock provides the "release" to ensure that 182 * nf_conntrack_locks_all==true is visible to everyone that 183 * acquired spin_lock(&nf_conntrack_locks[]). 184 */ 185 spin_unlock(&nf_conntrack_locks[i]); 186 } 187 } 188 189 static void nf_conntrack_all_unlock(void) 190 __releases(&nf_conntrack_locks_all_lock) 191 { 192 /* All prior stores must be complete before we clear 193 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() 194 * might observe the false value but not the entire 195 * critical section. 
196 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 197 */ 198 smp_store_release(&nf_conntrack_locks_all, false); 199 spin_unlock(&nf_conntrack_locks_all_lock); 200 } 201 202 unsigned int nf_conntrack_htable_size __read_mostly; 203 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 204 205 unsigned int nf_conntrack_max __read_mostly; 206 EXPORT_SYMBOL_GPL(nf_conntrack_max); 207 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 208 static siphash_aligned_key_t nf_conntrack_hash_rnd; 209 210 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 211 unsigned int zoneid, 212 const struct net *net) 213 { 214 siphash_key_t key; 215 216 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 217 218 key = nf_conntrack_hash_rnd; 219 220 key.key[0] ^= zoneid; 221 key.key[1] ^= net_hash_mix(net); 222 223 return siphash((void *)tuple, 224 offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend), 225 &key); 226 } 227 228 static u32 scale_hash(u32 hash) 229 { 230 return reciprocal_scale(hash, nf_conntrack_htable_size); 231 } 232 233 static u32 __hash_conntrack(const struct net *net, 234 const struct nf_conntrack_tuple *tuple, 235 unsigned int zoneid, 236 unsigned int size) 237 { 238 return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); 239 } 240 241 static u32 hash_conntrack(const struct net *net, 242 const struct nf_conntrack_tuple *tuple, 243 unsigned int zoneid) 244 { 245 return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); 246 } 247 248 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 249 unsigned int dataoff, 250 struct nf_conntrack_tuple *tuple) 251 { struct { 252 __be16 sport; 253 __be16 dport; 254 } _inet_hdr, *inet_hdr; 255 256 /* Actually only need first 4 bytes to get ports. 
*/ 257 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 258 if (!inet_hdr) 259 return false; 260 261 tuple->src.u.udp.port = inet_hdr->sport; 262 tuple->dst.u.udp.port = inet_hdr->dport; 263 return true; 264 } 265 266 static bool 267 nf_ct_get_tuple(const struct sk_buff *skb, 268 unsigned int nhoff, 269 unsigned int dataoff, 270 u_int16_t l3num, 271 u_int8_t protonum, 272 struct net *net, 273 struct nf_conntrack_tuple *tuple) 274 { 275 unsigned int size; 276 const __be32 *ap; 277 __be32 _addrs[8]; 278 279 memset(tuple, 0, sizeof(*tuple)); 280 281 tuple->src.l3num = l3num; 282 switch (l3num) { 283 case NFPROTO_IPV4: 284 nhoff += offsetof(struct iphdr, saddr); 285 size = 2 * sizeof(__be32); 286 break; 287 case NFPROTO_IPV6: 288 nhoff += offsetof(struct ipv6hdr, saddr); 289 size = sizeof(_addrs); 290 break; 291 default: 292 return true; 293 } 294 295 ap = skb_header_pointer(skb, nhoff, size, _addrs); 296 if (!ap) 297 return false; 298 299 switch (l3num) { 300 case NFPROTO_IPV4: 301 tuple->src.u3.ip = ap[0]; 302 tuple->dst.u3.ip = ap[1]; 303 break; 304 case NFPROTO_IPV6: 305 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 306 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 307 break; 308 } 309 310 tuple->dst.protonum = protonum; 311 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 312 313 switch (protonum) { 314 #if IS_ENABLED(CONFIG_IPV6) 315 case IPPROTO_ICMPV6: 316 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); 317 #endif 318 case IPPROTO_ICMP: 319 return icmp_pkt_to_tuple(skb, dataoff, net, tuple); 320 #ifdef CONFIG_NF_CT_PROTO_GRE 321 case IPPROTO_GRE: 322 return gre_pkt_to_tuple(skb, dataoff, net, tuple); 323 #endif 324 case IPPROTO_TCP: 325 case IPPROTO_UDP: 326 #ifdef CONFIG_NF_CT_PROTO_SCTP 327 case IPPROTO_SCTP: 328 #endif 329 /* fallthrough */ 330 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 331 default: 332 break; 333 } 334 335 return true; 336 } 337 338 static int ipv4_get_l4proto(const struct sk_buff 
*skb, unsigned int nhoff, 339 u_int8_t *protonum) 340 { 341 int dataoff = -1; 342 const struct iphdr *iph; 343 struct iphdr _iph; 344 345 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 346 if (!iph) 347 return -1; 348 349 /* Conntrack defragments packets, we might still see fragments 350 * inside ICMP packets though. 351 */ 352 if (iph->frag_off & htons(IP_OFFSET)) 353 return -1; 354 355 dataoff = nhoff + (iph->ihl << 2); 356 *protonum = iph->protocol; 357 358 /* Check bogus IP headers */ 359 if (dataoff > skb->len) { 360 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", 361 nhoff, iph->ihl << 2, skb->len); 362 return -1; 363 } 364 return dataoff; 365 } 366 367 #if IS_ENABLED(CONFIG_IPV6) 368 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 369 u8 *protonum) 370 { 371 int protoff = -1; 372 unsigned int extoff = nhoff + sizeof(struct ipv6hdr); 373 __be16 frag_off; 374 u8 nexthdr; 375 376 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), 377 &nexthdr, sizeof(nexthdr)) != 0) { 378 pr_debug("can't get nexthdr\n"); 379 return -1; 380 } 381 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); 382 /* 383 * (protoff == skb->len) means the packet has not data, just 384 * IPv6 and possibly extensions headers, but it is tracked anyway 385 */ 386 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { 387 pr_debug("can't find proto in pkt\n"); 388 return -1; 389 } 390 391 *protonum = nexthdr; 392 return protoff; 393 } 394 #endif 395 396 static int get_l4proto(const struct sk_buff *skb, 397 unsigned int nhoff, u8 pf, u8 *l4num) 398 { 399 switch (pf) { 400 case NFPROTO_IPV4: 401 return ipv4_get_l4proto(skb, nhoff, l4num); 402 #if IS_ENABLED(CONFIG_IPV6) 403 case NFPROTO_IPV6: 404 return ipv6_get_l4proto(skb, nhoff, l4num); 405 #endif 406 default: 407 *l4num = 0; 408 break; 409 } 410 return -1; 411 } 412 413 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, 414 u_int16_t l3num, 415 struct 
net *net, struct nf_conntrack_tuple *tuple) 416 { 417 u8 protonum; 418 int protoff; 419 420 protoff = get_l4proto(skb, nhoff, l3num, &protonum); 421 if (protoff <= 0) 422 return false; 423 424 return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); 425 } 426 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); 427 428 bool 429 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, 430 const struct nf_conntrack_tuple *orig) 431 { 432 memset(inverse, 0, sizeof(*inverse)); 433 434 inverse->src.l3num = orig->src.l3num; 435 436 switch (orig->src.l3num) { 437 case NFPROTO_IPV4: 438 inverse->src.u3.ip = orig->dst.u3.ip; 439 inverse->dst.u3.ip = orig->src.u3.ip; 440 break; 441 case NFPROTO_IPV6: 442 inverse->src.u3.in6 = orig->dst.u3.in6; 443 inverse->dst.u3.in6 = orig->src.u3.in6; 444 break; 445 default: 446 break; 447 } 448 449 inverse->dst.dir = !orig->dst.dir; 450 451 inverse->dst.protonum = orig->dst.protonum; 452 453 switch (orig->dst.protonum) { 454 case IPPROTO_ICMP: 455 return nf_conntrack_invert_icmp_tuple(inverse, orig); 456 #if IS_ENABLED(CONFIG_IPV6) 457 case IPPROTO_ICMPV6: 458 return nf_conntrack_invert_icmpv6_tuple(inverse, orig); 459 #endif 460 } 461 462 inverse->src.u.all = orig->dst.u.all; 463 inverse->dst.u.all = orig->src.u.all; 464 return true; 465 } 466 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); 467 468 /* Generate a almost-unique pseudo-id for a given conntrack. 469 * 470 * intentionally doesn't re-use any of the seeds used for hash 471 * table location, we assume id gets exposed to userspace. 472 * 473 * Following nf_conn items do not change throughout lifetime 474 * of the nf_conn: 475 * 476 * 1. nf_conn address 477 * 2. nf_conn->master address (normally NULL) 478 * 3. the associated net namespace 479 * 4. 
the original direction tuple 480 */ 481 u32 nf_ct_get_id(const struct nf_conn *ct) 482 { 483 static siphash_aligned_key_t ct_id_seed; 484 unsigned long a, b, c, d; 485 486 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); 487 488 a = (unsigned long)ct; 489 b = (unsigned long)ct->master; 490 c = (unsigned long)nf_ct_net(ct); 491 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 492 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), 493 &ct_id_seed); 494 #ifdef CONFIG_64BIT 495 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); 496 #else 497 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); 498 #endif 499 } 500 EXPORT_SYMBOL_GPL(nf_ct_get_id); 501 502 static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) 503 { 504 return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); 505 } 506 507 static void 508 clean_from_lists(struct nf_conn *ct) 509 { 510 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 511 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 512 513 /* Destroy all pending expectations */ 514 nf_ct_remove_expectations(ct); 515 } 516 517 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 518 519 /* Released via nf_ct_destroy() */ 520 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 521 const struct nf_conntrack_zone *zone, 522 gfp_t flags) 523 { 524 struct nf_conn *tmpl, *p; 525 526 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 527 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 528 if (!tmpl) 529 return NULL; 530 531 p = tmpl; 532 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 533 if (tmpl != p) 534 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 535 } else { 536 tmpl = kzalloc_obj(*tmpl, flags); 537 if (!tmpl) 538 return NULL; 539 } 540 541 tmpl->status = IPS_TEMPLATE; 542 write_pnet(&tmpl->ct_net, net); 543 nf_ct_zone_add(tmpl, zone); 544 refcount_set(&tmpl->ct_general.use, 1); 545 546 return tmpl; 547 } 548 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 549 550 
void nf_ct_tmpl_free(struct nf_conn *tmpl) 551 { 552 kfree(tmpl->ext); 553 554 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 555 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 556 else 557 kfree(tmpl); 558 } 559 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 560 561 static void destroy_gre_conntrack(struct nf_conn *ct) 562 { 563 #ifdef CONFIG_NF_CT_PROTO_GRE 564 struct nf_conn *master = ct->master; 565 566 if (master) 567 nf_ct_gre_keymap_destroy(master); 568 #endif 569 } 570 571 void nf_ct_destroy(struct nf_conntrack *nfct) 572 { 573 struct nf_conn *ct = (struct nf_conn *)nfct; 574 575 WARN_ON(refcount_read(&nfct->use) != 0); 576 577 if (unlikely(nf_ct_is_template(ct))) { 578 nf_ct_tmpl_free(ct); 579 return; 580 } 581 582 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) 583 destroy_gre_conntrack(ct); 584 585 /* Expectations will have been removed in clean_from_lists, 586 * except TFTP can create an expectation on the first packet, 587 * before connection is in the list, so we need to clean here, 588 * too. 
589 */ 590 nf_ct_remove_expectations(ct); 591 592 if (ct->master) 593 nf_ct_put(ct->master); 594 595 nf_conntrack_free(ct); 596 } 597 EXPORT_SYMBOL(nf_ct_destroy); 598 599 static void __nf_ct_delete_from_lists(struct nf_conn *ct) 600 { 601 struct net *net = nf_ct_net(ct); 602 unsigned int hash, reply_hash; 603 unsigned int sequence; 604 605 do { 606 sequence = read_seqcount_begin(&nf_conntrack_generation); 607 hash = hash_conntrack(net, 608 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 609 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 610 reply_hash = hash_conntrack(net, 611 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 612 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 613 } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); 614 615 clean_from_lists(ct); 616 nf_conntrack_double_unlock(hash, reply_hash); 617 } 618 619 static void nf_ct_delete_from_lists(struct nf_conn *ct) 620 { 621 nf_ct_helper_destroy(ct); 622 local_bh_disable(); 623 624 __nf_ct_delete_from_lists(ct); 625 626 local_bh_enable(); 627 } 628 629 static void nf_ct_add_to_ecache_list(struct nf_conn *ct) 630 { 631 #ifdef CONFIG_NF_CONNTRACK_EVENTS 632 struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); 633 634 spin_lock(&cnet->ecache.dying_lock); 635 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 636 &cnet->ecache.dying_list); 637 spin_unlock(&cnet->ecache.dying_lock); 638 #endif 639 } 640 641 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 642 { 643 struct nf_conn_tstamp *tstamp; 644 struct net *net; 645 646 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 647 return false; 648 649 tstamp = nf_conn_tstamp_find(ct); 650 if (tstamp) { 651 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 652 653 tstamp->stop = ktime_get_real_ns(); 654 if (timeout < 0) 655 tstamp->stop -= jiffies_to_nsecs(-timeout); 656 } 657 658 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 659 portid, report) < 0) { 660 /* destroy event was not delivered. 
nf_ct_put will 661 * be done by event cache worker on redelivery. 662 */ 663 nf_ct_helper_destroy(ct); 664 local_bh_disable(); 665 __nf_ct_delete_from_lists(ct); 666 nf_ct_add_to_ecache_list(ct); 667 local_bh_enable(); 668 669 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); 670 return false; 671 } 672 673 net = nf_ct_net(ct); 674 if (nf_conntrack_ecache_dwork_pending(net)) 675 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); 676 nf_ct_delete_from_lists(ct); 677 nf_ct_put(ct); 678 return true; 679 } 680 EXPORT_SYMBOL_GPL(nf_ct_delete); 681 682 static inline bool 683 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 684 const struct nf_conntrack_tuple *tuple, 685 const struct nf_conntrack_zone *zone, 686 const struct net *net) 687 { 688 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 689 690 /* A conntrack can be recreated with the equal tuple, 691 * so we need to check that the conntrack is confirmed 692 */ 693 return nf_ct_tuple_equal(tuple, &h->tuple) && 694 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 695 nf_ct_is_confirmed(ct) && 696 net_eq(net, nf_ct_net(ct)); 697 } 698 699 static inline bool 700 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 701 { 702 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 703 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 704 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 705 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 706 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 707 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 708 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 709 } 710 711 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 712 static void nf_ct_gc_expired(struct nf_conn *ct) 713 { 714 if (!refcount_inc_not_zero(&ct->ct_general.use)) 715 return; 716 717 /* load ->status after refcount increase */ 718 smp_acquire__after_ctrl_dep(); 719 720 if (nf_ct_should_gc(ct)) 721 nf_ct_kill(ct); 722 723 
nf_ct_put(ct); 724 } 725 726 /* 727 * Warning : 728 * - Caller must take a reference on returned object 729 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 730 */ 731 static struct nf_conntrack_tuple_hash * 732 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 733 const struct nf_conntrack_tuple *tuple, u32 hash) 734 { 735 struct nf_conntrack_tuple_hash *h; 736 struct hlist_nulls_head *ct_hash; 737 struct hlist_nulls_node *n; 738 unsigned int bucket, hsize; 739 740 begin: 741 nf_conntrack_get_ht(&ct_hash, &hsize); 742 bucket = reciprocal_scale(hash, hsize); 743 744 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 745 struct nf_conn *ct; 746 747 ct = nf_ct_tuplehash_to_ctrack(h); 748 if (nf_ct_is_expired(ct)) { 749 nf_ct_gc_expired(ct); 750 continue; 751 } 752 753 if (nf_ct_key_equal(h, tuple, zone, net)) 754 return h; 755 } 756 /* 757 * if the nulls value we got at the end of this lookup is 758 * not the expected one, we must restart lookup. 759 * We probably met an item that was moved to another chain. 760 */ 761 if (get_nulls_value(n) != bucket) { 762 NF_CT_STAT_INC_ATOMIC(net, search_restart); 763 goto begin; 764 } 765 766 return NULL; 767 } 768 769 /* Find a connection corresponding to a tuple. 
*/ 770 static struct nf_conntrack_tuple_hash * 771 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 772 const struct nf_conntrack_tuple *tuple, u32 hash) 773 { 774 struct nf_conntrack_tuple_hash *h; 775 struct nf_conn *ct; 776 777 h = ____nf_conntrack_find(net, zone, tuple, hash); 778 if (h) { 779 /* We have a candidate that matches the tuple we're interested 780 * in, try to obtain a reference and re-check tuple 781 */ 782 ct = nf_ct_tuplehash_to_ctrack(h); 783 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { 784 /* re-check key after refcount */ 785 smp_acquire__after_ctrl_dep(); 786 787 if (likely(nf_ct_key_equal(h, tuple, zone, net))) 788 return h; 789 790 /* TYPESAFE_BY_RCU recycled the candidate */ 791 nf_ct_put(ct); 792 } 793 794 h = NULL; 795 } 796 797 return h; 798 } 799 800 struct nf_conntrack_tuple_hash * 801 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 802 const struct nf_conntrack_tuple *tuple) 803 { 804 unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 805 struct nf_conntrack_tuple_hash *thash; 806 807 rcu_read_lock(); 808 809 thash = __nf_conntrack_find_get(net, zone, tuple, 810 hash_conntrack_raw(tuple, zone_id, net)); 811 812 if (thash) 813 goto out_unlock; 814 815 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 816 if (rid != zone_id) 817 thash = __nf_conntrack_find_get(net, zone, tuple, 818 hash_conntrack_raw(tuple, rid, net)); 819 820 out_unlock: 821 rcu_read_unlock(); 822 return thash; 823 } 824 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 825 826 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 827 unsigned int hash, 828 unsigned int reply_hash) 829 { 830 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 831 &nf_conntrack_hash[hash]); 832 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 833 &nf_conntrack_hash[reply_hash]); 834 } 835 836 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) 837 { 838 /* if 
ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions 839 * may contain stale pointers to e.g. helper that has been removed. 840 * 841 * The helper can't clear this because the nf_conn object isn't in 842 * any hash and synchronize_rcu() isn't enough because associated skb 843 * might sit in a queue. 844 */ 845 return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); 846 } 847 848 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) 849 { 850 if (!ext) 851 return true; 852 853 if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) 854 return false; 855 856 /* inserted into conntrack table, nf_ct_iterate_cleanup() 857 * will find it. Disable nf_ct_ext_find() id check. 858 */ 859 WRITE_ONCE(ext->gen_id, 0); 860 return true; 861 } 862 863 int 864 nf_conntrack_hash_check_insert(struct nf_conn *ct) 865 { 866 const struct nf_conntrack_zone *zone; 867 struct net *net = nf_ct_net(ct); 868 unsigned int hash, reply_hash; 869 struct nf_conntrack_tuple_hash *h; 870 struct hlist_nulls_node *n; 871 unsigned int max_chainlen; 872 unsigned int chainlen = 0; 873 unsigned int sequence; 874 int err = -EEXIST; 875 876 zone = nf_ct_zone(ct); 877 878 if (!nf_ct_ext_valid_pre(ct->ext)) 879 return -EAGAIN; 880 881 local_bh_disable(); 882 do { 883 sequence = read_seqcount_begin(&nf_conntrack_generation); 884 hash = hash_conntrack(net, 885 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 886 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 887 reply_hash = hash_conntrack(net, 888 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 889 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 890 } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); 891 892 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); 893 894 /* See if there's one in the list already, including reverse */ 895 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 896 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 897 zone, net)) 898 goto out; 899 900 if 
(chainlen++ > max_chainlen) 901 goto chaintoolong; 902 } 903 904 chainlen = 0; 905 906 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 907 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 908 zone, net)) 909 goto out; 910 if (chainlen++ > max_chainlen) 911 goto chaintoolong; 912 } 913 914 /* If genid has changed, we can't insert anymore because ct 915 * extensions could have stale pointers and nf_ct_iterate_destroy 916 * might have completed its table scan already. 917 * 918 * Increment of the ext genid right after this check is fine: 919 * nf_ct_iterate_destroy blocks until locks are released. 920 */ 921 if (!nf_ct_ext_valid_post(ct->ext)) { 922 err = -EAGAIN; 923 goto out; 924 } 925 926 smp_wmb(); 927 /* The caller holds a reference to this object */ 928 refcount_set(&ct->ct_general.use, 2); 929 __nf_conntrack_hash_insert(ct, hash, reply_hash); 930 nf_conntrack_double_unlock(hash, reply_hash); 931 NF_CT_STAT_INC(net, insert); 932 local_bh_enable(); 933 934 return 0; 935 chaintoolong: 936 NF_CT_STAT_INC(net, chaintoolong); 937 err = -ENOSPC; 938 out: 939 nf_conntrack_double_unlock(hash, reply_hash); 940 local_bh_enable(); 941 return err; 942 } 943 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 944 945 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, 946 unsigned int bytes) 947 { 948 struct nf_conn_acct *acct; 949 950 acct = nf_conn_acct_find(ct); 951 if (acct) { 952 struct nf_conn_counter *counter = acct->counter; 953 954 atomic64_add(packets, &counter[dir].packets); 955 atomic64_add(bytes, &counter[dir].bytes); 956 } 957 } 958 EXPORT_SYMBOL_GPL(nf_ct_acct_add); 959 960 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 961 const struct nf_conn *loser_ct) 962 { 963 struct nf_conn_acct *acct; 964 965 acct = nf_conn_acct_find(loser_ct); 966 if (acct) { 967 struct nf_conn_counter *counter = acct->counter; 968 unsigned int bytes; 969 970 /* u32 should be fine since we must 
have seen one packet. */ 971 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 972 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 973 } 974 } 975 976 static void __nf_conntrack_insert_prepare(struct nf_conn *ct) 977 { 978 struct nf_conn_tstamp *tstamp; 979 980 refcount_inc(&ct->ct_general.use); 981 982 /* set conntrack timestamp, if enabled. */ 983 tstamp = nf_conn_tstamp_find(ct); 984 if (tstamp) 985 tstamp->start = ktime_get_real_ns(); 986 } 987 988 /** 989 * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow 990 * @ct1: conntrack in hash table to check against 991 * @ct2: merge candidate 992 * 993 * returns true if ct1 and ct2 happen to refer to the same flow, but 994 * in opposing directions, i.e. 995 * ct1: a:b -> c:d 996 * ct2: c:d -> a:b 997 * for both directions. If so, @ct2 should not have been created 998 * as the skb should have been picked up as ESTABLISHED flow. 999 * But ct1 was not yet committed to hash table before skb that created 1000 * ct2 had arrived. 1001 * 1002 * Note we don't compare netns because ct entries in different net 1003 * namespace cannot clash to begin with. 1004 * 1005 * @return: true if ct1 and ct2 are identical when swapping origin/reply. 
1006 */ 1007 static bool 1008 nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) 1009 { 1010 u16 id1, id2; 1011 1012 if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1013 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) 1014 return false; 1015 1016 if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 1017 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) 1018 return false; 1019 1020 id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); 1021 id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); 1022 if (id1 != id2) 1023 return false; 1024 1025 id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); 1026 id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); 1027 1028 return id1 == id2; 1029 } 1030 1031 static int nf_ct_can_merge(const struct nf_conn *ct, 1032 const struct nf_conn *loser_ct) 1033 { 1034 return nf_ct_match(ct, loser_ct) || 1035 nf_ct_match_reverse(ct, loser_ct); 1036 } 1037 1038 /* caller must hold locks to prevent concurrent changes */ 1039 static int __nf_ct_resolve_clash(struct sk_buff *skb, 1040 struct nf_conntrack_tuple_hash *h) 1041 { 1042 /* This is the conntrack entry already in hashes that won race. */ 1043 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 1044 enum ip_conntrack_info ctinfo; 1045 struct nf_conn *loser_ct; 1046 1047 loser_ct = nf_ct_get(skb, &ctinfo); 1048 1049 if (nf_ct_can_merge(ct, loser_ct)) { 1050 struct net *net = nf_ct_net(ct); 1051 1052 nf_conntrack_get(&ct->ct_general); 1053 1054 nf_ct_acct_merge(ct, ctinfo, loser_ct); 1055 nf_ct_put(loser_ct); 1056 nf_ct_set(skb, ct, ctinfo); 1057 1058 NF_CT_STAT_INC(net, clash_resolve); 1059 return NF_ACCEPT; 1060 } 1061 1062 return NF_DROP; 1063 } 1064 1065 /** 1066 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry 1067 * 1068 * @skb: skb that causes the collision 1069 * @repl_idx: hash slot for reply direction 1070 * 1071 * Called when origin or reply direction had a clash. 
* The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	/* The unconfirmed conntrack attached to the skb, i.e. the loser. */
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 *
	 * If the reply bucket already holds an entry with the same reply
	 * tuple, the outcome is whatever __nf_ct_resolve_clash() decides.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* Only the REPLY direction is really linked into the hash table. */
	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);
	/* confirmed bit must be set after hlist add, not before:
	 * loser_ct can still be visible to other cpu due to
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	smp_mb__before_atomic();
	set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.
The new entry will also have a fixed timeout so it expires -- 1161 * due to the collision, it will only see reply traffic. 1162 * 1163 * Returns NF_DROP if the clash could not be resolved. 1164 */ 1165 static __cold noinline int 1166 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, 1167 u32 reply_hash) 1168 { 1169 /* This is the conntrack entry already in hashes that won race. */ 1170 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 1171 const struct nf_conntrack_l4proto *l4proto; 1172 enum ip_conntrack_info ctinfo; 1173 struct nf_conn *loser_ct; 1174 struct net *net; 1175 int ret; 1176 1177 loser_ct = nf_ct_get(skb, &ctinfo); 1178 net = nf_ct_net(loser_ct); 1179 1180 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1181 if (!l4proto->allow_clash) 1182 goto drop; 1183 1184 ret = __nf_ct_resolve_clash(skb, h); 1185 if (ret == NF_ACCEPT) 1186 return ret; 1187 1188 ret = nf_ct_resolve_clash_harder(skb, reply_hash); 1189 if (ret == NF_ACCEPT) 1190 return ret; 1191 1192 drop: 1193 NF_CT_STAT_INC(net, drop); 1194 NF_CT_STAT_INC(net, insert_failed); 1195 return NF_DROP; 1196 } 1197 1198 /* Confirm a connection given skb; places it in hash table */ 1199 int 1200 __nf_conntrack_confirm(struct sk_buff *skb) 1201 { 1202 unsigned int chainlen = 0, sequence, max_chainlen; 1203 const struct nf_conntrack_zone *zone; 1204 unsigned int hash, reply_hash; 1205 struct nf_conntrack_tuple_hash *h; 1206 struct nf_conn *ct; 1207 struct nf_conn_help *help; 1208 struct hlist_nulls_node *n; 1209 enum ip_conntrack_info ctinfo; 1210 struct net *net; 1211 int ret = NF_DROP; 1212 1213 ct = nf_ct_get(skb, &ctinfo); 1214 net = nf_ct_net(ct); 1215 1216 /* ipt_REJECT uses nf_conntrack_attach to attach related 1217 ICMP/TCP RST packets in other direction. Actual packet 1218 which created connection will be IP_CT_NEW or for an 1219 expected connection, IP_CT_RELATED. 
*/ 1220 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 1221 return NF_ACCEPT; 1222 1223 zone = nf_ct_zone(ct); 1224 local_bh_disable(); 1225 1226 do { 1227 sequence = read_seqcount_begin(&nf_conntrack_generation); 1228 /* reuse the hash saved before */ 1229 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 1230 hash = scale_hash(hash); 1231 reply_hash = hash_conntrack(net, 1232 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1233 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 1234 } while (nf_conntrack_double_lock(hash, reply_hash, sequence)); 1235 1236 /* We're not in hash table, and we refuse to set up related 1237 * connections for unconfirmed conns. But packet copies and 1238 * REJECT will give spurious warnings here. 1239 */ 1240 1241 /* Another skb with the same unconfirmed conntrack may 1242 * win the race. This may happen for bridge(br_flood) 1243 * or broadcast/multicast packets do skb_clone with 1244 * unconfirmed conntrack. 1245 */ 1246 if (unlikely(nf_ct_is_confirmed(ct))) { 1247 WARN_ON_ONCE(1); 1248 nf_conntrack_double_unlock(hash, reply_hash); 1249 local_bh_enable(); 1250 return NF_DROP; 1251 } 1252 1253 if (!nf_ct_ext_valid_pre(ct->ext)) { 1254 NF_CT_STAT_INC(net, insert_failed); 1255 goto dying; 1256 } 1257 1258 /* We have to check the DYING flag after unlink to prevent 1259 * a race against nf_ct_get_next_corpse() possibly called from 1260 * user context, else we insert an already 'dead' hash, blocking 1261 * further use of that particular connection -JM. 1262 */ 1263 if (unlikely(nf_ct_is_dying(ct))) { 1264 NF_CT_STAT_INC(net, insert_failed); 1265 goto dying; 1266 } 1267 1268 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); 1269 /* See if there's one in the list already, including reverse: 1270 NAT could have grabbed it without realizing, since we're 1271 not in the hash. If there is, we lost race. 
*/ 1272 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 1273 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1274 zone, net)) 1275 goto out; 1276 if (chainlen++ > max_chainlen) 1277 goto chaintoolong; 1278 } 1279 1280 chainlen = 0; 1281 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 1282 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1283 zone, net)) 1284 goto out; 1285 if (chainlen++ > max_chainlen) { 1286 chaintoolong: 1287 NF_CT_STAT_INC(net, chaintoolong); 1288 NF_CT_STAT_INC(net, insert_failed); 1289 ret = NF_DROP; 1290 goto dying; 1291 } 1292 } 1293 1294 /* Timeout is relative to confirmation time, not original 1295 setting time, otherwise we'd get timer wrap in 1296 weird delay cases. */ 1297 ct->timeout += nfct_time_stamp; 1298 1299 __nf_conntrack_insert_prepare(ct); 1300 1301 /* Since the lookup is lockless, hash insertion must be done after 1302 * setting ct->timeout. The RCU barriers guarantee that no other CPU 1303 * can find the conntrack before the above stores are visible. 1304 */ 1305 __nf_conntrack_hash_insert(ct, hash, reply_hash); 1306 1307 /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups 1308 * skip entries that lack this bit. This happens when a CPU is looking 1309 * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU 1310 * or when another CPU encounters this entry right after the insertion 1311 * but before the set-confirm-bit below. This bit must not be set until 1312 * after __nf_conntrack_hash_insert(). 1313 */ 1314 smp_mb__before_atomic(); 1315 set_bit(IPS_CONFIRMED_BIT, &ct->status); 1316 1317 nf_conntrack_double_unlock(hash, reply_hash); 1318 local_bh_enable(); 1319 1320 /* ext area is still valid (rcu read lock is held, 1321 * but will go out of scope soon, we need to remove 1322 * this conntrack again. 
1323 */ 1324 if (!nf_ct_ext_valid_post(ct->ext)) { 1325 nf_ct_kill(ct); 1326 NF_CT_STAT_INC_ATOMIC(net, drop); 1327 return NF_DROP; 1328 } 1329 1330 help = nfct_help(ct); 1331 if (help && help->helper) 1332 nf_conntrack_event_cache(IPCT_HELPER, ct); 1333 1334 nf_conntrack_event_cache(master_ct(ct) ? 1335 IPCT_RELATED : IPCT_NEW, ct); 1336 return NF_ACCEPT; 1337 1338 out: 1339 ret = nf_ct_resolve_clash(skb, h, reply_hash); 1340 dying: 1341 nf_conntrack_double_unlock(hash, reply_hash); 1342 local_bh_enable(); 1343 return ret; 1344 } 1345 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 1346 1347 /* Returns true if a connection corresponds to the tuple (required 1348 for NAT). */ 1349 int 1350 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, 1351 const struct nf_conn *ignored_conntrack) 1352 { 1353 struct net *net = nf_ct_net(ignored_conntrack); 1354 const struct nf_conntrack_zone *zone; 1355 struct nf_conntrack_tuple_hash *h; 1356 struct hlist_nulls_head *ct_hash; 1357 unsigned int hash, hsize; 1358 struct hlist_nulls_node *n; 1359 struct nf_conn *ct; 1360 1361 zone = nf_ct_zone(ignored_conntrack); 1362 1363 rcu_read_lock(); 1364 begin: 1365 nf_conntrack_get_ht(&ct_hash, &hsize); 1366 hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); 1367 1368 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 1369 ct = nf_ct_tuplehash_to_ctrack(h); 1370 1371 if (ct == ignored_conntrack) 1372 continue; 1373 1374 if (nf_ct_is_expired(ct)) { 1375 nf_ct_gc_expired(ct); 1376 continue; 1377 } 1378 1379 if (nf_ct_key_equal(h, tuple, zone, net)) { 1380 /* Tuple is taken already, so caller will need to find 1381 * a new source port to use. 1382 * 1383 * Only exception: 1384 * If the *original tuples* are identical, then both 1385 * conntracks refer to the same flow. 1386 * This is a rare situation, it can occur e.g. when 1387 * more than one UDP packet is sent from same socket 1388 * in different threads. 
1389 * 1390 * Let nf_ct_resolve_clash() deal with this later. 1391 */ 1392 if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1393 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 1394 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) 1395 continue; 1396 1397 NF_CT_STAT_INC_ATOMIC(net, found); 1398 rcu_read_unlock(); 1399 return 1; 1400 } 1401 } 1402 1403 if (get_nulls_value(n) != hash) { 1404 NF_CT_STAT_INC_ATOMIC(net, search_restart); 1405 goto begin; 1406 } 1407 1408 rcu_read_unlock(); 1409 1410 return 0; 1411 } 1412 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); 1413 1414 #define NF_CT_EVICTION_RANGE 8 1415 1416 /* There's a small race here where we may free a just-assured 1417 connection. Too bad: we're in trouble anyway. */ 1418 static unsigned int early_drop_list(struct net *net, 1419 struct hlist_nulls_head *head) 1420 { 1421 struct nf_conntrack_tuple_hash *h; 1422 struct hlist_nulls_node *n; 1423 unsigned int drops = 0; 1424 struct nf_conn *tmp; 1425 1426 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1427 tmp = nf_ct_tuplehash_to_ctrack(h); 1428 1429 if (nf_ct_is_expired(tmp)) { 1430 nf_ct_gc_expired(tmp); 1431 continue; 1432 } 1433 1434 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1435 !net_eq(nf_ct_net(tmp), net) || 1436 nf_ct_is_dying(tmp)) 1437 continue; 1438 1439 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1440 continue; 1441 1442 /* load ->ct_net and ->status after refcount increase */ 1443 smp_acquire__after_ctrl_dep(); 1444 1445 /* kill only if still in same netns -- might have moved due to 1446 * SLAB_TYPESAFE_BY_RCU rules. 1447 * 1448 * We steal the timer reference. If that fails timer has 1449 * already fired or someone else deleted it. Just drop ref 1450 * and move to next entry. 
1451 */ 1452 if (net_eq(nf_ct_net(tmp), net) && 1453 nf_ct_is_confirmed(tmp) && 1454 nf_ct_delete(tmp, 0, 0)) 1455 drops++; 1456 1457 nf_ct_put(tmp); 1458 } 1459 1460 return drops; 1461 } 1462 1463 static noinline int early_drop(struct net *net, unsigned int hash) 1464 { 1465 unsigned int i, bucket; 1466 1467 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1468 struct hlist_nulls_head *ct_hash; 1469 unsigned int hsize, drops; 1470 1471 rcu_read_lock(); 1472 nf_conntrack_get_ht(&ct_hash, &hsize); 1473 if (!i) 1474 bucket = reciprocal_scale(hash, hsize); 1475 else 1476 bucket = (bucket + 1) % hsize; 1477 1478 drops = early_drop_list(net, &ct_hash[bucket]); 1479 rcu_read_unlock(); 1480 1481 if (drops) { 1482 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1483 return true; 1484 } 1485 } 1486 1487 return false; 1488 } 1489 1490 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1491 { 1492 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1493 } 1494 1495 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1496 { 1497 const struct nf_conntrack_l4proto *l4proto; 1498 u8 protonum = nf_ct_protonum(ct); 1499 1500 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1501 return true; 1502 1503 l4proto = nf_ct_l4proto_find(protonum); 1504 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1505 return true; 1506 1507 return false; 1508 } 1509 1510 static void gc_worker(struct work_struct *work) 1511 { 1512 unsigned int i, hashsz, nf_conntrack_max95 = 0; 1513 u32 end_time, start_time = nfct_time_stamp; 1514 struct conntrack_gc_work *gc_work; 1515 unsigned int expired_count = 0; 1516 unsigned long next_run; 1517 s32 delta_time; 1518 long count; 1519 1520 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1521 1522 i = gc_work->next_bucket; 1523 if (gc_work->early_drop) 1524 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1525 1526 if (i == 0) { 1527 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; 1528 gc_work->count = GC_SCAN_INITIAL_COUNT; 
1529 gc_work->start_time = start_time; 1530 } 1531 1532 next_run = gc_work->avg_timeout; 1533 count = gc_work->count; 1534 1535 end_time = start_time + GC_SCAN_MAX_DURATION; 1536 1537 do { 1538 struct nf_conntrack_tuple_hash *h; 1539 struct hlist_nulls_head *ct_hash; 1540 struct hlist_nulls_node *n; 1541 struct nf_conn *tmp; 1542 1543 rcu_read_lock(); 1544 1545 nf_conntrack_get_ht(&ct_hash, &hashsz); 1546 if (i >= hashsz) { 1547 rcu_read_unlock(); 1548 break; 1549 } 1550 1551 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1552 struct nf_conntrack_net *cnet; 1553 struct net *net; 1554 long expires; 1555 1556 tmp = nf_ct_tuplehash_to_ctrack(h); 1557 1558 if (expired_count > GC_SCAN_EXPIRED_MAX) { 1559 rcu_read_unlock(); 1560 1561 gc_work->next_bucket = i; 1562 gc_work->avg_timeout = next_run; 1563 gc_work->count = count; 1564 1565 delta_time = nfct_time_stamp - gc_work->start_time; 1566 1567 /* re-sched immediately if total cycle time is exceeded */ 1568 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; 1569 goto early_exit; 1570 } 1571 1572 if (nf_ct_is_expired(tmp)) { 1573 nf_ct_gc_expired(tmp); 1574 expired_count++; 1575 continue; 1576 } 1577 1578 expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); 1579 expires = (expires - (long)next_run) / ++count; 1580 next_run += expires; 1581 1582 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1583 continue; 1584 1585 net = nf_ct_net(tmp); 1586 cnet = nf_ct_pernet(net); 1587 if (atomic_read(&cnet->count) < nf_conntrack_max95) 1588 continue; 1589 1590 /* need to take reference to avoid possible races */ 1591 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1592 continue; 1593 1594 /* load ->status after refcount increase */ 1595 smp_acquire__after_ctrl_dep(); 1596 1597 if (gc_worker_skip_ct(tmp)) { 1598 nf_ct_put(tmp); 1599 continue; 1600 } 1601 1602 if (gc_worker_can_early_drop(tmp)) { 1603 nf_ct_kill(tmp); 1604 expired_count++; 1605 } 1606 1607 nf_ct_put(tmp); 1608 } 1609 
1610 /* could check get_nulls_value() here and restart if ct 1611 * was moved to another chain. But given gc is best-effort 1612 * we will just continue with next hash slot. 1613 */ 1614 rcu_read_unlock(); 1615 cond_resched(); 1616 i++; 1617 1618 delta_time = nfct_time_stamp - end_time; 1619 if (delta_time > 0 && i < hashsz) { 1620 gc_work->avg_timeout = next_run; 1621 gc_work->count = count; 1622 gc_work->next_bucket = i; 1623 next_run = 0; 1624 goto early_exit; 1625 } 1626 } while (i < hashsz); 1627 1628 gc_work->next_bucket = 0; 1629 1630 next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); 1631 1632 delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); 1633 if (next_run > (unsigned long)delta_time) 1634 next_run -= delta_time; 1635 else 1636 next_run = 1; 1637 1638 early_exit: 1639 if (gc_work->exiting) 1640 return; 1641 1642 if (next_run) 1643 gc_work->early_drop = false; 1644 1645 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1646 } 1647 1648 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1649 { 1650 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); 1651 gc_work->exiting = false; 1652 } 1653 1654 static struct nf_conn * 1655 __nf_conntrack_alloc(struct net *net, 1656 const struct nf_conntrack_zone *zone, 1657 const struct nf_conntrack_tuple *orig, 1658 const struct nf_conntrack_tuple *repl, 1659 gfp_t gfp, u32 hash) 1660 { 1661 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 1662 unsigned int ct_count; 1663 struct nf_conn *ct; 1664 1665 /* We don't want any race condition at early drop stage */ 1666 ct_count = atomic_inc_return(&cnet->count); 1667 1668 if (unlikely(ct_count > nf_conntrack_max)) { 1669 if (!early_drop(net, hash)) { 1670 if (!conntrack_gc_work.early_drop) 1671 conntrack_gc_work.early_drop = true; 1672 atomic_dec(&cnet->count); 1673 if (net == &init_net) 1674 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1675 else 1676 
net_warn_ratelimited("nf_conntrack: table full in netns %u, dropping packet\n", 1677 net->ns.inum); 1678 return ERR_PTR(-ENOMEM); 1679 } 1680 } 1681 1682 /* 1683 * Do not use kmem_cache_zalloc(), as this cache uses 1684 * SLAB_TYPESAFE_BY_RCU. 1685 */ 1686 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1687 if (ct == NULL) 1688 goto out; 1689 1690 spin_lock_init(&ct->lock); 1691 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1692 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1693 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1694 /* save hash for reusing when confirming */ 1695 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1696 ct->status = 0; 1697 WRITE_ONCE(ct->timeout, 0); 1698 write_pnet(&ct->ct_net, net); 1699 memset_after(ct, 0, __nfct_init_offset); 1700 1701 nf_ct_zone_add(ct, zone); 1702 1703 /* Because we use RCU lookups, we set ct_general.use to zero before 1704 * this is inserted in any list. 1705 */ 1706 refcount_set(&ct->ct_general.use, 0); 1707 return ct; 1708 out: 1709 atomic_dec(&cnet->count); 1710 return ERR_PTR(-ENOMEM); 1711 } 1712 1713 struct nf_conn *nf_conntrack_alloc(struct net *net, 1714 const struct nf_conntrack_zone *zone, 1715 const struct nf_conntrack_tuple *orig, 1716 const struct nf_conntrack_tuple *repl, 1717 gfp_t gfp) 1718 { 1719 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1720 } 1721 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1722 1723 void nf_conntrack_free(struct nf_conn *ct) 1724 { 1725 struct net *net = nf_ct_net(ct); 1726 struct nf_conntrack_net *cnet; 1727 1728 /* A freed object has refcnt == 0, that's 1729 * the golden rule for SLAB_TYPESAFE_BY_RCU 1730 */ 1731 WARN_ON(refcount_read(&ct->ct_general.use) != 0); 1732 1733 if (ct->status & IPS_SRC_NAT_DONE) { 1734 const struct nf_nat_hook *nat_hook; 1735 1736 rcu_read_lock(); 1737 nat_hook = rcu_dereference(nf_nat_hook); 1738 if (nat_hook) 1739 nat_hook->remove_nat_bysrc(ct); 1740 rcu_read_unlock(); 1741 } 1742 1743 
kfree(ct->ext); 1744 kmem_cache_free(nf_conntrack_cachep, ct); 1745 cnet = nf_ct_pernet(net); 1746 1747 smp_mb__before_atomic(); 1748 atomic_dec(&cnet->count); 1749 } 1750 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1751 1752 1753 /* Allocate a new conntrack: we return -ENOMEM if classification 1754 failed due to stress. Otherwise it really is unclassifiable. */ 1755 static noinline struct nf_conntrack_tuple_hash * 1756 init_conntrack(struct net *net, struct nf_conn *tmpl, 1757 const struct nf_conntrack_tuple *tuple, 1758 struct sk_buff *skb, 1759 unsigned int dataoff, u32 hash) 1760 { 1761 struct nf_conn *ct; 1762 struct nf_conn_help *help; 1763 struct nf_conntrack_tuple repl_tuple; 1764 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1765 struct nf_conntrack_ecache *ecache; 1766 #endif 1767 struct nf_conntrack_expect *exp = NULL; 1768 const struct nf_conntrack_zone *zone; 1769 struct nf_conn_timeout *timeout_ext; 1770 struct nf_conntrack_zone tmp; 1771 struct nf_conntrack_net *cnet; 1772 1773 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) 1774 return NULL; 1775 1776 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1777 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1778 hash); 1779 if (IS_ERR(ct)) 1780 return ERR_CAST(ct); 1781 1782 if (!nf_ct_add_synproxy(ct, tmpl)) { 1783 nf_conntrack_free(ct); 1784 return ERR_PTR(-ENOMEM); 1785 } 1786 1787 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1788 1789 if (timeout_ext) 1790 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1791 GFP_ATOMIC); 1792 1793 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1794 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1795 nf_ct_labels_ext_add(ct); 1796 1797 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1798 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1799 1800 if ((ecache || net->ct.sysctl_events) && 1801 !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1802 ecache ? 
ecache->expmask : 0, 1803 GFP_ATOMIC)) { 1804 nf_conntrack_free(ct); 1805 return ERR_PTR(-ENOMEM); 1806 } 1807 #endif 1808 1809 cnet = nf_ct_pernet(net); 1810 if (cnet->expect_count) { 1811 spin_lock_bh(&nf_conntrack_expect_lock); 1812 exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); 1813 if (exp) { 1814 /* Welcome, Mr. Bond. We've been expecting you... */ 1815 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1816 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1817 ct->master = exp->master; 1818 if (exp->helper) { 1819 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1820 if (help) 1821 rcu_assign_pointer(help->helper, exp->helper); 1822 } 1823 1824 #ifdef CONFIG_NF_CONNTRACK_MARK 1825 ct->mark = READ_ONCE(exp->master->mark); 1826 #endif 1827 #ifdef CONFIG_NF_CONNTRACK_SECMARK 1828 ct->secmark = exp->master->secmark; 1829 #endif 1830 NF_CT_STAT_INC(net, expect_new); 1831 } 1832 spin_unlock_bh(&nf_conntrack_expect_lock); 1833 } 1834 if (!exp && tmpl) 1835 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1836 1837 /* Other CPU might have obtained a pointer to this object before it was 1838 * released. Because refcount is 0, refcount_inc_not_zero() will fail. 1839 * 1840 * After refcount_set(1) it will succeed; ensure that zeroing of 1841 * ct->status and the correct ct->net pointer are visible; else other 1842 * core might observe CONFIRMED bit which means the entry is valid and 1843 * in the hash table, but its not (anymore). 1844 */ 1845 smp_wmb(); 1846 1847 /* Now it is going to be associated with an sk_buff, set refcount to 1. 
*/ 1848 refcount_set(&ct->ct_general.use, 1); 1849 1850 if (exp) { 1851 if (exp->expectfn) 1852 exp->expectfn(ct, exp); 1853 nf_ct_expect_put(exp); 1854 } 1855 1856 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1857 } 1858 1859 /* On success, returns 0, sets skb->_nfct | ctinfo */ 1860 static int 1861 resolve_normal_ct(struct nf_conn *tmpl, 1862 struct sk_buff *skb, 1863 unsigned int dataoff, 1864 u_int8_t protonum, 1865 const struct nf_hook_state *state) 1866 { 1867 const struct nf_conntrack_zone *zone; 1868 struct nf_conntrack_tuple tuple; 1869 struct nf_conntrack_tuple_hash *h; 1870 enum ip_conntrack_info ctinfo; 1871 struct nf_conntrack_zone tmp; 1872 u32 hash, zone_id, rid; 1873 struct nf_conn *ct; 1874 1875 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1876 dataoff, state->pf, protonum, state->net, 1877 &tuple)) 1878 return 0; 1879 1880 /* look for tuple match */ 1881 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1882 1883 zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 1884 hash = hash_conntrack_raw(&tuple, zone_id, state->net); 1885 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1886 1887 if (!h) { 1888 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 1889 if (zone_id != rid) { 1890 u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); 1891 1892 h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); 1893 } 1894 } 1895 1896 if (!h) { 1897 h = init_conntrack(state->net, tmpl, &tuple, 1898 skb, dataoff, hash); 1899 if (!h) 1900 return 0; 1901 if (IS_ERR(h)) 1902 return PTR_ERR(h); 1903 } 1904 ct = nf_ct_tuplehash_to_ctrack(h); 1905 1906 /* It exists; we have (non-exclusive) reference. */ 1907 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1908 ctinfo = IP_CT_ESTABLISHED_REPLY; 1909 } else { 1910 unsigned long status = READ_ONCE(ct->status); 1911 1912 /* Once we've had two way comms, always ESTABLISHED. 
*/ 1913 if (likely(status & IPS_SEEN_REPLY)) 1914 ctinfo = IP_CT_ESTABLISHED; 1915 else if (status & IPS_EXPECTED) 1916 ctinfo = IP_CT_RELATED; 1917 else 1918 ctinfo = IP_CT_NEW; 1919 } 1920 nf_ct_set(skb, ct, ctinfo); 1921 return 0; 1922 } 1923 1924 /* 1925 * icmp packets need special treatment to handle error messages that are 1926 * related to a connection. 1927 * 1928 * Callers need to check if skb has a conntrack assigned when this 1929 * helper returns; in such case skb belongs to an already known connection. 1930 */ 1931 static unsigned int __cold 1932 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1933 struct sk_buff *skb, 1934 unsigned int dataoff, 1935 u8 protonum, 1936 const struct nf_hook_state *state) 1937 { 1938 int ret; 1939 1940 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1941 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1942 #if IS_ENABLED(CONFIG_IPV6) 1943 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1944 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1945 #endif 1946 else 1947 return NF_ACCEPT; 1948 1949 if (ret <= 0) 1950 NF_CT_STAT_INC_ATOMIC(state->net, error); 1951 1952 return ret; 1953 } 1954 1955 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1956 enum ip_conntrack_info ctinfo) 1957 { 1958 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1959 1960 if (!timeout) 1961 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1962 1963 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1964 return NF_ACCEPT; 1965 } 1966 1967 /* Returns verdict for packet, or -1 for invalid. 
*/
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	/* Dispatch on the L4 protocol number of the conntrack; protocols
	 * without a (compiled-in) case end up in generic_packet().
	 */
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

/* Main per-packet entry point: find or create the conntrack entry for @skb,
 * run the L4 protocol handler on it and return a netfilter verdict.
 *
 * A template conntrack attached to the skb on entry is detached, handed
 * to the lookup code, and its reference is dropped before returning.
 */
unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		     ctinfo == IP_CT_UNTRACKED)
			return NF_ACCEPT;
		/* Detach the template; tmpl still holds the pointer. */
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		/* Results <= 0 carry the negated verdict. */
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_ct_put(ct);
		skb->_nfct = 0;
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;

		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);

		ret = -ret;
		goto out;
	}

	/* First packet seen in the reply direction: set SEEN_REPLY and
	 * cache an IPCT_REPLY event exactly once.
	 */
	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Refresh conntrack for this many jiffies and do accounting if bytes is
 * non-zero.  Entries with IPS_FIXED_TIMEOUT set keep their timeout and are
 * only accounted.
 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  u32 extra_jiffies,
			  unsigned int bytes)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	/* Skip the store when the value would not change. */
	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (bytes)
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

/* Account @skb's length to @ct in the direction given by @ctinfo, then
 * delete the entry.  Returns the result of nf_ct_delete().
 */
bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike.
*/ 2127 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2128 const struct nf_conntrack_tuple *tuple) 2129 { 2130 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2131 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2132 goto nla_put_failure; 2133 return 0; 2134 2135 nla_put_failure: 2136 return -1; 2137 } 2138 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2139 2140 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2141 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2142 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2143 }; 2144 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2145 2146 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2147 struct nf_conntrack_tuple *t, 2148 u_int32_t flags) 2149 { 2150 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2151 if (!tb[CTA_PROTO_SRC_PORT]) 2152 return -EINVAL; 2153 2154 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2155 } 2156 2157 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2158 if (!tb[CTA_PROTO_DST_PORT]) 2159 return -EINVAL; 2160 2161 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2162 } 2163 2164 return 0; 2165 } 2166 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2167 2168 unsigned int nf_ct_port_nlattr_tuple_size(void) 2169 { 2170 static unsigned int size __read_mostly; 2171 2172 if (!size) 2173 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2174 2175 return size; 2176 } 2177 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2178 #endif 2179 2180 /* Used by ipt_REJECT and ip6t_REJECT. 
*/ 2181 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2182 { 2183 struct nf_conn *ct; 2184 enum ip_conntrack_info ctinfo; 2185 2186 /* This ICMP is in reverse direction to the packet which caused it */ 2187 ct = nf_ct_get(skb, &ctinfo); 2188 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2189 ctinfo = IP_CT_RELATED_REPLY; 2190 else 2191 ctinfo = IP_CT_RELATED; 2192 2193 /* Attach to new skbuff, and increment count */ 2194 nf_ct_set(nskb, ct, ctinfo); 2195 nf_conntrack_get(skb_nfct(nskb)); 2196 } 2197 2198 /* This packet is coming from userspace via nf_queue, complete the packet 2199 * processing after the helper invocation in nf_confirm(). 2200 */ 2201 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2202 enum ip_conntrack_info ctinfo) 2203 { 2204 const struct nf_conntrack_helper *helper; 2205 const struct nf_conn_help *help; 2206 int protoff; 2207 2208 help = nfct_help(ct); 2209 if (!help) 2210 return NF_ACCEPT; 2211 2212 helper = rcu_dereference(help->helper); 2213 if (!helper) 2214 return NF_ACCEPT; 2215 2216 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2217 return NF_ACCEPT; 2218 2219 switch (nf_ct_l3num(ct)) { 2220 case NFPROTO_IPV4: 2221 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2222 break; 2223 #if IS_ENABLED(CONFIG_IPV6) 2224 case NFPROTO_IPV6: { 2225 __be16 frag_off; 2226 u8 pnum; 2227 2228 pnum = ipv6_hdr(skb)->nexthdr; 2229 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2230 &frag_off); 2231 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2232 return NF_ACCEPT; 2233 break; 2234 } 2235 #endif 2236 default: 2237 return NF_ACCEPT; 2238 } 2239 2240 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2241 !nf_is_loopback_packet(skb)) { 2242 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2243 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2244 return NF_DROP; 2245 } 2246 } 2247 2248 /* We've seen it coming out the other side: confirm it */ 2249 return 
nf_conntrack_confirm(skb); 2250 } 2251 2252 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2253 { 2254 enum ip_conntrack_info ctinfo; 2255 struct nf_conn *ct; 2256 2257 ct = nf_ct_get(skb, &ctinfo); 2258 if (!ct) 2259 return NF_ACCEPT; 2260 2261 return nf_confirm_cthelper(skb, ct, ctinfo); 2262 } 2263 2264 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2265 const struct sk_buff *skb) 2266 { 2267 const struct nf_conntrack_tuple *src_tuple; 2268 const struct nf_conntrack_tuple_hash *hash; 2269 struct nf_conntrack_tuple srctuple; 2270 enum ip_conntrack_info ctinfo; 2271 struct nf_conn *ct; 2272 2273 ct = nf_ct_get(skb, &ctinfo); 2274 if (ct) { 2275 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2276 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2277 return true; 2278 } 2279 2280 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2281 NFPROTO_IPV4, dev_net(skb->dev), 2282 &srctuple)) 2283 return false; 2284 2285 hash = nf_conntrack_find_get(dev_net(skb->dev), 2286 &nf_ct_zone_dflt, 2287 &srctuple); 2288 if (!hash) 2289 return false; 2290 2291 ct = nf_ct_tuplehash_to_ctrack(hash); 2292 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2293 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2294 nf_ct_put(ct); 2295 2296 return true; 2297 } 2298 2299 /* Bring out ya dead! 
*/
/*
 * Find the next conntrack for which @iter returns non-zero.
 *
 * The scan starts at bucket *@bucket and walks towards
 * nf_conntrack_htable_size.  Each non-empty bucket is walked with its
 * bucket lock held and bottom halves disabled.  When iter_data->net is
 * set, entries of other namespaces are skipped.
 *
 * On a match a reference is taken on the conntrack before the lock is
 * dropped and *@bucket is left pointing at the matching bucket, so the
 * next call resumes there.  Returns NULL once the table is exhausted.
 */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to hash table twice,
			 * once for the original direction tuple, once for the
			 * reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	/* Still under the bucket lock: pin the entry before releasing it. */
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

/*
 * Delete every conntrack selected by @iter.
 *
 * Runs with nf_conntrack_mutex held for the whole walk.  Each entry
 * returned by get_next_corpse() is deleted with the portid/report values
 * from @iter_data; nf_ct_put() then pairs with the reference taken in
 * get_next_corpse().  May sleep.
 */
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

/*
 * Per-namespace variant of nf_ct_iterate_cleanup().
 *
 * iter_data->net must be set.  Returns without walking the table when the
 * namespace's conntrack count is zero.  May sleep.
 */
void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * main table).
 *
 * Can only be called in module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;

	/* Drop queued packets in every namespace that has conntracks. */
	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for netns cleanup worker to finish, if it's
	 * running -- it might have deleted a net namespace from
	 * the global list, so hook drop above might not have
	 * affected all namespaces.
	 */
	net_ns_barrier();

	/* a skb w. unconfirmed conntrack could have been reinjected just
	 * before we called nf_queue_nf_hook_drop().
	 *
	 * This makes sure it's inserted into conntrack table.
	 */
	synchronize_net();

	nf_ct_ext_bump_genid();
	/* iter_data.net stays NULL, so the walk covers all namespaces. */
	iter_data.data = data;
	nf_ct_iterate_cleanup(iter, &iter_data);

	/* Another cpu might be in a rcu read section with
	 * rcu protected pointer cleared in iter callback
	 * or hidden via nf_ct_ext_bump_genid() above.
	 *
	 * Wait until those are done.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

/* Iterator callback that selects every conntrack. */
static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

/*
 * First stage of teardown: run the conntrack BPF cleanup and flag the
 * garbage collection work as exiting.
 */
void nf_conntrack_cleanup_start(void)
{
	cleanup_nf_conntrack_bpf();
	conntrack_gc_work.exiting = true;
}

/*
 * Final stage of teardown: clear the nf_ct_hook pointer, stop the gc work,
 * free the hash table, shut down the proto/helper/expect subsystems and
 * destroy the conntrack slab cache.
 */
void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 *
 * Cleans up a single namespace by wrapping it in a one-element exit list
 * and handing that to nf_conntrack_cleanup_net_list().
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

/*
 * Remove all conntracks of every namespace on @net_exit_list, then release
 * the per-namespace ecache, expectation and statistics state.
 *
 * The kill pass is repeated until every namespace reports a conntrack
 * count of zero; a debug warning fires once if that takes more than 60s.
 */
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	struct nf_ct_iter_data iter_data = {};
	unsigned long start = jiffies;
	struct net *net;
	int busy;

	/*
	 *  This makes sure all current packets have passed through
	 *  netfilter framework.  Roll on, two-stage module
	 *  delete...
	 */
	synchronize_rcu_expedited();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		iter_data.net = net;
		nf_ct_iterate_cleanup_net(kill_all, &iter_data);
		if (atomic_read(&cnet->count) != 0)
			busy = 1;
	}
	if (busy) {
		DEBUG_NET_WARN_ONCE(time_after(jiffies, start + 60 * HZ),
				    "conntrack cleanup blocked for 60s");
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
	}
}

/*
 * Allocate a zeroed hash table.
 *
 * @sizep: in: requested number of slots; out: the slot count rounded up to
 *         a multiple of PAGE_SIZE / sizeof(struct hlist_nulls_head).  Note
 *         that *@sizep has already been rounded when the second size check
 *         fails.
 * @nulls: when non-zero, each head is initialised as a nulls list head
 *         using its slot index as nulls value.
 *
 * Returns the table, or NULL if the size would exceed INT_MAX bytes or the
 * allocation fails.  The callers in this file release it with kvfree().
 */
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	/* Rounding up may have pushed the size past the limit. */
	if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	hash = kvzalloc_objs(struct hlist_nulls_head, nr_slots);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

/*
 * Resize the global conntrack hash table to @hashsize slots (rounded up by
 * nf_ct_alloc_hashtable()).
 *
 * The new table is allocated up front.  The rehash itself runs under
 * nf_conntrack_mutex with bottom halves disabled, all bucket locks held
 * and inside a write section of nf_conntrack_generation.  The old table is
 * freed after synchronize_net().
 *
 * Returns 0 on success or when the size is unchanged, -EINVAL for a zero
 * size, -ENOMEM when the new table cannot be allocated.
 */
int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	mutex_lock(&nf_conntrack_mutex);
	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		/* Nothing to do: drop the table we just allocated. */
		mutex_unlock(&nf_conntrack_mutex);
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the locks.
	 */

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		/* Pop entries off the old bucket one by one and rehash them. */
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			unsigned int zone_id;

			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);

			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, zone_id, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	mutex_unlock(&nf_conntrack_mutex);

	/* Free the old table only after synchronize_net() has returned. */
	synchronize_net();
	kvfree(old_hash);
	return 0;
}

/*
 * Parameter setter for the hash table size (presumably wired up as a
 * module parameter elsewhere -- not visible in this part of the file).
 *
 * Only callers in the initial network namespace may change the size; others
 * get -EOPNOTSUPP.  Before the hash table exists the value is simply
 * stored through param_set_uint(); afterwards @val is parsed and the table
 * is resized.
 */
int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

/*
 * Global initialisation: locks, hash table, slab cache, the expect, helper
 * and proto subsystems, the gc work and the conntrack BPF registration.
 *
 * Returns 0 on success or a negative errno; on failure everything set up
 * so far is torn down again in reverse order.
 */
int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	/*
	 * No table size given: derive one from the amount of RAM.
	 * Base value is RAM bytes / 16384 / sizeof(struct hlist_head);
	 * more than 4GiB on 64 bit selects 262144 slots, more than 1GiB
	 * selects 65536, and the result is never below 1024.
	 */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;
		/* Use a max. factor of one by default to keep the average
		 * hash chain length at 2 entries.  Each entry has to be added
		 * twice (once for original direction, once for reply).
		 * When a table size is given we use the old value of 8 to
		 * avoid implicit reduction of the max entries setting.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	/* Start garbage collection; the first run is scheduled HZ jiffies out. */
	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	ret = register_nf_conntrack_bpf();
	if (ret < 0)
		goto err_kfunc;

	return 0;

err_kfunc:
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	nf_conntrack_proto_fini();
err_proto:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

/*
 * ->set_closing hook: forward to the TCP tracker for TCP conntracks; other
 * protocols are left alone.
 */
static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = nf_ct_to_nf_conn(nfct);

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		nf_conntrack_tcp_set_closing(ct);
		break;
	}
}

/* Callback table published through nf_ct_hook by nf_conntrack_init_end(). */
static const struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= nf_ct_destroy,
	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
	.attach		= nf_conntrack_attach,
	.set_closing	= nf_conntrack_set_closing,
	.confirm	= __nf_conntrack_confirm,
	.get_id		= nf_conntrack_get_id,
};

/* Publish the conntrack callbacks; nf_conntrack_cleanup_end() clears them. */
void nf_conntrack_init_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in hash table
 *
 * NOTE(review): no user of this macro is visible in this part of the
 * file -- confirm whether it is still needed.
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)

/*
 * Per-namespace initialisation: reset the conntrack count, allocate the
 * per-cpu statistics and initialise the expect, acct, tstamp, ecache and
 * proto per-namespace state.
 *
 * Returns 0 on success, -ENOMEM if the statistics cannot be allocated, or
 * the error from nf_conntrack_expect_pernet_init().
 */
int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	int ret = -ENOMEM;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		return ret;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
	return ret;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

/*
 * Set a new timeout on @ct.
 *
 * Returns -EPERM without touching the entry when it has a fixed timeout,
 * -ETIME when the entry is found dying after the timeout was written, and
 * 0 otherwise.
 */
int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		return -EPERM;

	__nf_ct_set_timeout(ct, timeout);

	if (test_bit(IPS_DYING_BIT, &ct->status))
		return -ETIME;

	return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

/*
 * Set the status bits given in @on and clear those given in @off.
 *
 * Bits in IPS_UNCHANGEABLE_MASK are ignored in both masks.  A bit present
 * in both @on and @off ends up set.
 */
void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
	unsigned int bit;

	/* Ignore these unchangeable bits */
	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

/*
 * Validate a requested status word against the current one and apply it.
 *
 * Returns -EBUSY if @status differs from ct->status in the EXPECTED,
 * CONFIRMED or DYING bits, or if it would clear SEEN_REPLY or ASSURED.
 * Otherwise the bits of @status are set via __nf_ct_change_status() (with
 * an empty "off" mask, so nothing is cleared here) and 0 is returned.
 */
int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
	unsigned long d;

	/* Bits that differ between the current and the requested status. */
	d = ct->status ^ status;

	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
		/* unchangeable */
		return -EBUSY;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	__nf_ct_change_status(ct, status, 0);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);