1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic INET transport hashtables 8 * 9 * Authors: Lotsa people, from code originally in tcp 10 */ 11 12 #include <linux/module.h> 13 #include <linux/random.h> 14 #include <linux/sched.h> 15 #include <linux/slab.h> 16 #include <linux/wait.h> 17 #include <linux/vmalloc.h> 18 #include <linux/memblock.h> 19 #include <linux/gcd.h> 20 21 #include <net/addrconf.h> 22 #include <net/inet_connection_sock.h> 23 #include <net/inet_hashtables.h> 24 #if IS_ENABLED(CONFIG_IPV6) 25 #include <net/inet6_hashtables.h> 26 #endif 27 #include <net/hotdata.h> 28 #include <net/ip.h> 29 #include <net/rps.h> 30 #include <net/secure_seq.h> 31 #include <net/sock_reuseport.h> 32 #include <net/tcp.h> 33 34 static void inet_init_ehash_secret(void) 35 { 36 net_get_random_sleepable_once(&inet_ehash_secret, 37 sizeof(inet_ehash_secret)); 38 } 39 40 u32 inet_ehashfn(const struct net *net, const __be32 laddr, 41 const __u16 lport, const __be32 faddr, 42 const __be16 fport) 43 { 44 return lport + __inet_ehashfn(laddr, 0, faddr, fport, 45 inet_ehash_secret + net_hash_mix(net)); 46 } 47 EXPORT_SYMBOL_GPL(inet_ehashfn); 48 49 /* This function handles inet_sock, but also timewait and request sockets 50 * for IPv4/IPv6. 51 */ 52 static u32 sk_ehashfn(const struct sock *sk) 53 { 54 #if IS_ENABLED(CONFIG_IPV6) 55 if (sk->sk_family == AF_INET6 && 56 !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 57 return inet6_ehashfn(sock_net(sk), 58 &sk->sk_v6_rcv_saddr, sk->sk_num, 59 &sk->sk_v6_daddr, sk->sk_dport); 60 #endif 61 return inet_ehashfn(sock_net(sk), 62 sk->sk_rcv_saddr, sk->sk_num, 63 sk->sk_daddr, sk->sk_dport); 64 } 65 66 static bool sk_is_connect_bind(const struct sock *sk) 67 { 68 if (sk->sk_state == TCP_TIME_WAIT) 69 return inet_twsk(sk)->tw_connect_bind; 70 else 71 return sk->sk_userlocks & SOCK_CONNECT_BIND; 72 } 73 74 /* 75 * Allocate and initialize a new local port bind bucket. 76 * The bindhash mutex for snum's hash chain must be held here. 77 */ 78 struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, 79 struct net *net, 80 struct inet_bind_hashbucket *head, 81 const unsigned short snum, 82 int l3mdev) 83 { 84 struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); 85 86 if (tb) { 87 write_pnet(&tb->ib_net, net); 88 tb->l3mdev = l3mdev; 89 tb->port = snum; 90 tb->fastreuse = 0; 91 tb->fastreuseport = 0; 92 INIT_HLIST_HEAD(&tb->bhash2); 93 hlist_add_head_rcu(&tb->node, &head->chain); 94 } 95 return tb; 96 } 97 98 /* 99 * Caller must hold hashbucket lock for this tb with local BH disabled 100 */ 101 void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) 102 { 103 const struct inet_bind2_bucket *tb2; 104 105 if (hlist_empty(&tb->bhash2)) { 106 hlist_del_rcu(&tb->node); 107 kfree_rcu(tb, rcu); 108 return; 109 } 110 111 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 112 return; 113 hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { 114 if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) 115 return; 116 } 117 tb->fastreuse = -1; 118 tb->fastreuseport = -1; 119 } 120 121 bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, 122 unsigned short port, int l3mdev) 123 { 124 return net_eq(ib_net(tb), net) && tb->port == port && 125 tb->l3mdev == l3mdev; 126 } 127 128 static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, 129 struct net *net, 130 struct inet_bind_hashbucket *head, 131 struct inet_bind_bucket *tb, 132 const struct sock *sk) 133 { 134 write_pnet(&tb2->ib_net, net); 135 tb2->l3mdev = tb->l3mdev; 136 tb2->port = tb->port; 137 #if IS_ENABLED(CONFIG_IPV6) 138 BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED)); 139 if (sk->sk_family == AF_INET6) { 140 tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); 141 tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr; 142 } else { 143 tb2->addr_type = IPV6_ADDR_MAPPED; 144 ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr); 145 } 146 #else 147 tb2->rcv_saddr = sk->sk_rcv_saddr; 148 #endif 149 tb2->fastreuse = 0; 150 tb2->fastreuseport = 0; 151 INIT_HLIST_HEAD(&tb2->owners); 152 hlist_add_head(&tb2->node, &head->chain); 153 hlist_add_head(&tb2->bhash_node, &tb->bhash2); 154 } 155 156 struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, 157 struct net *net, 158 struct inet_bind_hashbucket *head, 159 struct inet_bind_bucket *tb, 160 const struct sock *sk) 161 { 162 struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC); 163 164 if (tb2) 165 inet_bind2_bucket_init(tb2, net, head, tb, sk); 166 167 return tb2; 168 } 169 170 /* Caller must hold hashbucket lock for this tb with local BH disabled */ 171 void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) 172 { 173 const struct sock *sk; 174 175 if (hlist_empty(&tb->owners)) { 176 __hlist_del(&tb->node); 177 __hlist_del(&tb->bhash_node); 178 kmem_cache_free(cachep, tb); 179 return; 180 } 181 182 if (tb->fastreuse == -1 && tb->fastreuseport == -1) 183 return; 184 sk_for_each_bound(sk, &tb->owners) { 185 if (!sk_is_connect_bind(sk)) 186 return; 187 } 188 tb->fastreuse = -1; 189 tb->fastreuseport = -1; 190 } 191 192 static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, 193 const struct sock *sk) 194 { 195 #if IS_ENABLED(CONFIG_IPV6) 196 if (sk->sk_family == AF_INET6) 197 return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); 198 199 if (tb2->addr_type != IPV6_ADDR_MAPPED) 200 return false; 201 #endif 202 return tb2->rcv_saddr == sk->sk_rcv_saddr; 203 } 204 205 void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, 206 struct inet_bind2_bucket *tb2, unsigned short port) 207 { 208 WRITE_ONCE(inet_sk(sk)->inet_num, port); 209 inet_csk(sk)->icsk_bind_hash = tb; 210 inet_csk(sk)->icsk_bind2_hash = tb2; 211 sk_add_bind_node(sk, &tb2->owners); 212 } 213 214 /* 215 * Get rid of any references to a local port held by the given sock. 216 */ 217 static void __inet_put_port(struct sock *sk) 218 { 219 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 220 struct inet_bind_hashbucket *head, *head2; 221 struct net *net = sock_net(sk); 222 struct inet_bind_bucket *tb; 223 int bhash; 224 225 bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); 226 head = &hashinfo->bhash[bhash]; 227 head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); 228 229 spin_lock(&head->lock); 230 tb = inet_csk(sk)->icsk_bind_hash; 231 inet_csk(sk)->icsk_bind_hash = NULL; 232 WRITE_ONCE(inet_sk(sk)->inet_num, 0); 233 sk->sk_userlocks &= ~SOCK_CONNECT_BIND; 234 235 spin_lock(&head2->lock); 236 if (inet_csk(sk)->icsk_bind2_hash) { 237 struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash; 238 239 __sk_del_bind_node(sk); 240 inet_csk(sk)->icsk_bind2_hash = NULL; 241 inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); 242 } 243 spin_unlock(&head2->lock); 244 245 inet_bind_bucket_destroy(tb); 246 spin_unlock(&head->lock); 247 } 248 249 void inet_put_port(struct sock *sk) 250 { 251 local_bh_disable(); 252 __inet_put_port(sk); 253 local_bh_enable(); 254 } 255 EXPORT_SYMBOL(inet_put_port); 256 257 int __inet_inherit_port(const struct sock *sk, struct sock *child) 258 { 259 struct inet_hashinfo *table = tcp_get_hashinfo(sk); 260 unsigned short port = inet_sk(child)->inet_num; 261 struct inet_bind_hashbucket *head, *head2; 262 bool created_inet_bind_bucket = false; 263 struct net *net = sock_net(sk); 264 bool update_fastreuse = false; 265 struct inet_bind2_bucket *tb2; 266 struct inet_bind_bucket *tb; 267 int bhash, l3mdev; 268 269 bhash = inet_bhashfn(net, port, table->bhash_size); 270 head = &table->bhash[bhash]; 271 head2 = inet_bhashfn_portaddr(table, child, net, port); 272 273 spin_lock(&head->lock); 274 spin_lock(&head2->lock); 275 tb = inet_csk(sk)->icsk_bind_hash; 276 tb2 = inet_csk(sk)->icsk_bind2_hash; 277 if (unlikely(!tb || !tb2)) { 278 spin_unlock(&head2->lock); 279 spin_unlock(&head->lock); 280 return -ENOENT; 281 } 282 if (tb->port != port) { 283 l3mdev = inet_sk_bound_l3mdev(sk); 284 285 /* NOTE: using tproxy and redirecting skbs to a proxy 286 * on a different listener port breaks the assumption 287 * that the listener socket's icsk_bind_hash is the same 288 * as that of the child socket. We have to look up or 289 * create a new bind bucket for the child here. */ 290 inet_bind_bucket_for_each(tb, &head->chain) { 291 if (inet_bind_bucket_match(tb, net, port, l3mdev)) 292 break; 293 } 294 if (!tb) { 295 tb = inet_bind_bucket_create(table->bind_bucket_cachep, 296 net, head, port, l3mdev); 297 if (!tb) { 298 spin_unlock(&head2->lock); 299 spin_unlock(&head->lock); 300 return -ENOMEM; 301 } 302 created_inet_bind_bucket = true; 303 } 304 update_fastreuse = true; 305 306 goto bhash2_find; 307 } else if (!inet_bind2_bucket_addr_match(tb2, child)) { 308 l3mdev = inet_sk_bound_l3mdev(sk); 309 310 bhash2_find: 311 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child); 312 if (!tb2) { 313 tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep, 314 net, head2, tb, child); 315 if (!tb2) 316 goto error; 317 } 318 } 319 if (update_fastreuse) 320 inet_csk_update_fastreuse(child, tb, tb2); 321 inet_bind_hash(child, tb, tb2, port); 322 spin_unlock(&head2->lock); 323 spin_unlock(&head->lock); 324 325 return 0; 326 327 error: 328 if (created_inet_bind_bucket) 329 inet_bind_bucket_destroy(tb); 330 spin_unlock(&head2->lock); 331 spin_unlock(&head->lock); 332 return -ENOMEM; 333 } 334 EXPORT_SYMBOL_GPL(__inet_inherit_port); 335 336 static struct inet_listen_hashbucket * 337 inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) 338 { 339 u32 hash; 340 341 #if IS_ENABLED(CONFIG_IPV6) 342 if (sk->sk_family == AF_INET6) 343 hash = ipv6_portaddr_hash(sock_net(sk), 344 &sk->sk_v6_rcv_saddr, 345 inet_sk(sk)->inet_num); 346 else 347 #endif 348 hash = ipv4_portaddr_hash(sock_net(sk), 349 inet_sk(sk)->inet_rcv_saddr, 350 inet_sk(sk)->inet_num); 351 return inet_lhash2_bucket(h, hash); 352 } 353 354 static inline int compute_score(struct sock *sk, const struct net *net, 355 const unsigned short hnum, const __be32 daddr, 356 const int dif, const int sdif) 357 { 358 int score = -1; 359 360 if (net_eq(sock_net(sk), net) && READ_ONCE(sk->sk_num) == hnum && 361 !ipv6_only_sock(sk)) { 362 if (sk->sk_rcv_saddr != daddr) 363 return -1; 364 365 if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) 366 return -1; 367 score = sk->sk_bound_dev_if ? 2 : 1; 368 369 if (sk->sk_family == PF_INET) 370 score++; 371 if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) 372 score++; 373 } 374 return score; 375 } 376 377 /** 378 * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary. 379 * @net: network namespace. 380 * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. 381 * @skb: context for a potential SK_REUSEPORT program. 382 * @doff: header offset. 383 * @saddr: source address. 384 * @sport: source port. 385 * @daddr: destination address. 386 * @hnum: destination port in host byte order. 387 * @ehashfn: hash function used to generate the fallback hash. 388 * 389 * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to 390 * the selected sock or an error. 391 */ 392 struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, 393 struct sk_buff *skb, int doff, 394 __be32 saddr, __be16 sport, 395 __be32 daddr, unsigned short hnum, 396 inet_ehashfn_t *ehashfn) 397 { 398 struct sock *reuse_sk = NULL; 399 u32 phash; 400 401 if (sk->sk_reuseport) { 402 phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn, 403 net, daddr, hnum, saddr, sport); 404 reuse_sk = reuseport_select_sock(sk, phash, skb, doff); 405 } 406 return reuse_sk; 407 } 408 EXPORT_SYMBOL_GPL(inet_lookup_reuseport); 409 410 /* 411 * Here are some nice properties to exploit here. The BSD API 412 * does not allow a listening sock to specify the remote port nor the 413 * remote address for the connection. So always assume those are both 414 * wildcarded during the search since they can never be otherwise. 415 */ 416 417 /* called with rcu_read_lock() : No refcount taken on the socket */ 418 static struct sock *inet_lhash2_lookup(const struct net *net, 419 struct inet_listen_hashbucket *ilb2, 420 struct sk_buff *skb, int doff, 421 const __be32 saddr, __be16 sport, 422 const __be32 daddr, const unsigned short hnum, 423 const int dif, const int sdif) 424 { 425 struct sock *sk, *result = NULL; 426 struct hlist_nulls_node *node; 427 int score, hiscore = 0; 428 429 sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { 430 score = compute_score(sk, net, hnum, daddr, dif, sdif); 431 if (score > hiscore) { 432 result = inet_lookup_reuseport(net, sk, skb, doff, 433 saddr, sport, daddr, hnum, inet_ehashfn); 434 if (result) 435 return result; 436 437 result = sk; 438 hiscore = score; 439 } 440 } 441 442 return result; 443 } 444 445 struct sock *inet_lookup_run_sk_lookup(const struct net *net, 446 int protocol, 447 struct sk_buff *skb, int doff, 448 __be32 saddr, __be16 sport, 449 __be32 daddr, u16 hnum, const int dif, 450 inet_ehashfn_t *ehashfn) 451 { 452 struct sock *sk, *reuse_sk; 453 bool no_reuseport; 454 455 no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport, 456 daddr, hnum, dif, &sk); 457 if (no_reuseport || IS_ERR_OR_NULL(sk)) 458 return sk; 459 460 reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, 461 ehashfn); 462 if (reuse_sk) 463 sk = reuse_sk; 464 return sk; 465 } 466 467 struct sock *__inet_lookup_listener(const struct net *net, 468 struct sk_buff *skb, int doff, 469 const __be32 saddr, __be16 sport, 470 const __be32 daddr, const unsigned short hnum, 471 const int dif, const int sdif) 472 { 473 struct inet_listen_hashbucket *ilb2; 474 struct inet_hashinfo *hashinfo; 475 struct sock *result = NULL; 476 unsigned int hash2; 477 478 /* Lookup redirect from BPF */ 479 if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { 480 result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, 481 saddr, sport, daddr, hnum, dif, 482 inet_ehashfn); 483 if (result) 484 goto done; 485 } 486 487 hashinfo = net->ipv4.tcp_death_row.hashinfo; 488 hash2 = ipv4_portaddr_hash(net, daddr, hnum); 489 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 490 491 result = inet_lhash2_lookup(net, ilb2, skb, doff, 492 saddr, sport, daddr, hnum, 493 dif, sdif); 494 if (result) 495 goto done; 496 497 /* Lookup lhash2 with INADDR_ANY */ 498 hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); 499 ilb2 = inet_lhash2_bucket(hashinfo, hash2); 500 501 result = inet_lhash2_lookup(net, ilb2, skb, doff, 502 saddr, sport, htonl(INADDR_ANY), hnum, 503 dif, sdif); 504 done: 505 if (IS_ERR(result)) 506 return NULL; 507 return result; 508 } 509 EXPORT_SYMBOL_GPL(__inet_lookup_listener); 510 511 /* All sockets share common refcount, but have different destructors */ 512 void sock_gen_put(struct sock *sk) 513 { 514 if (!refcount_dec_and_test(&sk->sk_refcnt)) 515 return; 516 517 if (sk->sk_state == TCP_TIME_WAIT) 518 inet_twsk_free(inet_twsk(sk)); 519 else if (sk->sk_state == TCP_NEW_SYN_RECV) 520 reqsk_free(inet_reqsk(sk)); 521 else 522 sk_free(sk); 523 } 524 EXPORT_SYMBOL_GPL(sock_gen_put); 525 526 void sock_edemux(struct sk_buff *skb) 527 { 528 sock_gen_put(skb->sk); 529 } 530 EXPORT_SYMBOL(sock_edemux); 531 532 struct sock *__inet_lookup_established(const struct net *net, 533 const __be32 saddr, const __be16 sport, 534 const __be32 daddr, const u16 hnum, 535 const int dif, const int sdif) 536 { 537 const __portpair ports = INET_COMBINED_PORTS(sport, hnum); 538 INET_ADDR_COOKIE(acookie, saddr, daddr); 539 const struct hlist_nulls_node *node; 540 struct inet_ehash_bucket *head; 541 struct inet_hashinfo *hashinfo; 542 unsigned int hash, slot; 543 struct sock *sk; 544 545 hashinfo = net->ipv4.tcp_death_row.hashinfo; 546 hash = inet_ehashfn(net, daddr, hnum, saddr, sport); 547 slot = hash & hashinfo->ehash_mask; 548 head = &hashinfo->ehash[slot]; 549 550 begin: 551 sk_nulls_for_each_rcu(sk, node, &head->chain) { 552 if (sk->sk_hash != hash) 553 continue; 554 if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { 555 if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) 556 goto out; 557 if (unlikely(!inet_match(net, sk, acookie, 558 ports, dif, sdif))) { 559 sock_gen_put(sk); 560 goto begin; 561 } 562 goto found; 563 } 564 } 565 /* 566 * if the nulls value we got at the end of this lookup is 567 * not the expected one, we must restart lookup. 568 * We probably met an item that was moved to another chain. 569 */ 570 if (get_nulls_value(node) != slot) 571 goto begin; 572 out: 573 sk = NULL; 574 found: 575 return sk; 576 } 577 EXPORT_SYMBOL_GPL(__inet_lookup_established); 578 579 /* called with local bh disabled */ 580 static int __inet_check_established(struct inet_timewait_death_row *death_row, 581 struct sock *sk, __u16 lport, 582 struct inet_timewait_sock **twp, 583 bool rcu_lookup, 584 u32 hash) 585 { 586 struct inet_hashinfo *hinfo = death_row->hashinfo; 587 struct inet_sock *inet = inet_sk(sk); 588 __be32 daddr = inet->inet_rcv_saddr; 589 __be32 saddr = inet->inet_daddr; 590 int dif = sk->sk_bound_dev_if; 591 struct net *net = sock_net(sk); 592 int sdif = l3mdev_master_ifindex_by_index(net, dif); 593 INET_ADDR_COOKIE(acookie, saddr, daddr); 594 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 595 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 596 struct inet_timewait_sock *tw = NULL; 597 const struct hlist_nulls_node *node; 598 struct sock *sk2; 599 spinlock_t *lock; 600 601 if (rcu_lookup) { 602 sk_nulls_for_each(sk2, node, &head->chain) { 603 if (sk2->sk_hash != hash || 604 !inet_match(net, sk2, acookie, ports, dif, sdif)) 605 continue; 606 if (sk2->sk_state == TCP_TIME_WAIT) 607 break; 608 return -EADDRNOTAVAIL; 609 } 610 return 0; 611 } 612 613 lock = inet_ehash_lockp(hinfo, hash); 614 spin_lock(lock); 615 616 sk_nulls_for_each(sk2, node, &head->chain) { 617 if (sk2->sk_hash != hash) 618 continue; 619 620 if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { 621 if (sk2->sk_state == TCP_TIME_WAIT) { 622 tw = inet_twsk(sk2); 623 if (tcp_twsk_unique(sk, sk2, twp)) 624 break; 625 } 626 goto not_unique; 627 } 628 } 629 630 /* Must record num and sport now. Otherwise we will see 631 * in hash table socket with a funny identity. 632 */ 633 inet->inet_num = lport; 634 inet->inet_sport = htons(lport); 635 sk->sk_hash = hash; 636 WARN_ON(!sk_unhashed(sk)); 637 __sk_nulls_add_node_rcu(sk, &head->chain); 638 if (tw) { 639 sk_nulls_del_node_init_rcu((struct sock *)tw); 640 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 641 } 642 spin_unlock(lock); 643 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 644 645 if (twp) { 646 *twp = tw; 647 } else if (tw) { 648 /* Silly. Should hash-dance instead... */ 649 inet_twsk_deschedule_put(tw); 650 } 651 return 0; 652 653 not_unique: 654 spin_unlock(lock); 655 return -EADDRNOTAVAIL; 656 } 657 658 static u64 inet_sk_port_offset(const struct sock *sk) 659 { 660 const struct inet_sock *inet = inet_sk(sk); 661 662 return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, 663 inet->inet_daddr, 664 inet->inet_dport); 665 } 666 667 /* Searches for an exsiting socket in the ehash bucket list. 668 * Returns true if found, false otherwise. 669 */ 670 static bool inet_ehash_lookup_by_sk(struct sock *sk, 671 struct hlist_nulls_head *list) 672 { 673 const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); 674 const int sdif = sk->sk_bound_dev_if; 675 const int dif = sk->sk_bound_dev_if; 676 const struct hlist_nulls_node *node; 677 struct net *net = sock_net(sk); 678 struct sock *esk; 679 680 INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); 681 682 sk_nulls_for_each_rcu(esk, node, list) { 683 if (esk->sk_hash != sk->sk_hash) 684 continue; 685 if (sk->sk_family == AF_INET) { 686 if (unlikely(inet_match(net, esk, acookie, 687 ports, dif, sdif))) { 688 return true; 689 } 690 } 691 #if IS_ENABLED(CONFIG_IPV6) 692 else if (sk->sk_family == AF_INET6) { 693 if (unlikely(inet6_match(net, esk, 694 &sk->sk_v6_daddr, 695 &sk->sk_v6_rcv_saddr, 696 ports, dif, sdif))) { 697 return true; 698 } 699 } 700 #endif 701 } 702 return false; 703 } 704 705 /* Insert a socket into ehash, and eventually remove another one 706 * (The another one can be a SYN_RECV or TIMEWAIT) 707 * If an existing socket already exists, socket sk is not inserted, 708 * and sets found_dup_sk parameter to true. 709 */ 710 bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) 711 { 712 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 713 struct inet_ehash_bucket *head; 714 struct hlist_nulls_head *list; 715 spinlock_t *lock; 716 bool ret = true; 717 718 WARN_ON_ONCE(!sk_unhashed(sk)); 719 720 sk->sk_hash = sk_ehashfn(sk); 721 head = inet_ehash_bucket(hashinfo, sk->sk_hash); 722 list = &head->chain; 723 lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 724 725 spin_lock(lock); 726 if (osk) { 727 WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); 728 ret = sk_nulls_replace_node_init_rcu(osk, sk); 729 goto unlock; 730 } 731 732 if (found_dup_sk) { 733 *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); 734 if (*found_dup_sk) 735 ret = false; 736 } 737 738 if (ret) 739 __sk_nulls_add_node_rcu(sk, list); 740 741 unlock: 742 spin_unlock(lock); 743 744 return ret; 745 } 746 747 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) 748 { 749 bool ok = inet_ehash_insert(sk, osk, found_dup_sk); 750 751 if (ok) { 752 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 753 } else { 754 tcp_orphan_count_inc(); 755 inet_sk_set_state(sk, TCP_CLOSE); 756 sock_set_flag(sk, SOCK_DEAD); 757 inet_csk_destroy_sock(sk); 758 } 759 return ok; 760 } 761 762 static int inet_reuseport_add_sock(struct sock *sk, 763 struct inet_listen_hashbucket *ilb) 764 { 765 struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; 766 const struct hlist_nulls_node *node; 767 kuid_t uid = sk_uid(sk); 768 struct sock *sk2; 769 770 sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { 771 if (sk2 != sk && 772 sk2->sk_family == sk->sk_family && 773 ipv6_only_sock(sk2) == ipv6_only_sock(sk) && 774 sk2->sk_bound_dev_if == sk->sk_bound_dev_if && 775 inet_csk(sk2)->icsk_bind_hash == tb && 776 sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && 777 inet_rcv_saddr_equal(sk, sk2, false)) 778 return reuseport_add_sock(sk, sk2, 779 inet_rcv_saddr_any(sk)); 780 } 781 782 return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); 783 } 784 785 int inet_hash(struct sock *sk) 786 { 787 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 788 struct inet_listen_hashbucket *ilb2; 789 int err = 0; 790 791 if (sk->sk_state == TCP_CLOSE) 792 return 0; 793 794 if (sk->sk_state != TCP_LISTEN) { 795 local_bh_disable(); 796 inet_ehash_nolisten(sk, NULL, NULL); 797 local_bh_enable(); 798 return 0; 799 } 800 801 #if IS_ENABLED(CONFIG_IPV6) 802 if (sk->sk_family == AF_INET6) 803 inet6_init_ehash_secret(); 804 #endif 805 inet_init_ehash_secret(); 806 807 WARN_ON(!sk_unhashed(sk)); 808 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 809 810 spin_lock(&ilb2->lock); 811 if (sk->sk_reuseport) { 812 err = inet_reuseport_add_sock(sk, ilb2); 813 if (err) 814 goto unlock; 815 } 816 sock_set_flag(sk, SOCK_RCU_FREE); 817 if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && 818 sk->sk_family == AF_INET6) 819 __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); 820 else 821 __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); 822 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 823 unlock: 824 spin_unlock(&ilb2->lock); 825 826 return err; 827 } 828 829 void inet_unhash(struct sock *sk) 830 { 831 struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); 832 833 if (sk_unhashed(sk)) 834 return; 835 836 sock_rps_delete_flow(sk); 837 if (sk->sk_state == TCP_LISTEN) { 838 struct inet_listen_hashbucket *ilb2; 839 840 ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); 841 /* Don't disable bottom halves while acquiring the lock to 842 * avoid circular locking dependency on PREEMPT_RT. 843 */ 844 spin_lock(&ilb2->lock); 845 if (rcu_access_pointer(sk->sk_reuseport_cb)) 846 reuseport_stop_listen_sock(sk); 847 848 __sk_nulls_del_node_init_rcu(sk); 849 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 850 spin_unlock(&ilb2->lock); 851 } else { 852 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); 853 854 spin_lock_bh(lock); 855 __sk_nulls_del_node_init_rcu(sk); 856 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 857 spin_unlock_bh(lock); 858 } 859 } 860 861 static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, 862 const struct net *net, unsigned short port, 863 int l3mdev, const struct sock *sk) 864 { 865 if (!net_eq(ib2_net(tb), net) || tb->port != port || 866 tb->l3mdev != l3mdev) 867 return false; 868 869 return inet_bind2_bucket_addr_match(tb, sk); 870 } 871 872 bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, 873 unsigned short port, int l3mdev, const struct sock *sk) 874 { 875 if (!net_eq(ib2_net(tb), net) || tb->port != port || 876 tb->l3mdev != l3mdev) 877 return false; 878 879 #if IS_ENABLED(CONFIG_IPV6) 880 if (tb->addr_type == IPV6_ADDR_ANY) 881 return true; 882 883 if (tb->addr_type != IPV6_ADDR_MAPPED) 884 return false; 885 886 if (sk->sk_family == AF_INET6 && 887 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) 888 return false; 889 #endif 890 return tb->rcv_saddr == 0; 891 } 892 893 /* The socket's bhash2 hashbucket spinlock must be held when this is called */ 894 struct inet_bind2_bucket * 895 inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, 896 unsigned short port, int l3mdev, const struct sock *sk) 897 { 898 struct inet_bind2_bucket *bhash2 = NULL; 899 900 inet_bind_bucket_for_each(bhash2, &head->chain) 901 if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk)) 902 break; 903 904 return bhash2; 905 } 906 907 struct inet_bind_hashbucket * 908 inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) 909 { 910 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 911 u32 hash; 912 913 #if IS_ENABLED(CONFIG_IPV6) 914 if (sk->sk_family == AF_INET6) 915 hash = ipv6_portaddr_hash(net, &in6addr_any, port); 916 else 917 #endif 918 hash = ipv4_portaddr_hash(net, 0, port); 919 920 return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; 921 } 922 923 static void inet_update_saddr(struct sock *sk, void *saddr, int family) 924 { 925 if (family == AF_INET) { 926 inet_sk(sk)->inet_saddr = *(__be32 *)saddr; 927 sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); 928 } 929 #if IS_ENABLED(CONFIG_IPV6) 930 else { 931 sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; 932 } 933 #endif 934 } 935 936 static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) 937 { 938 struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); 939 struct inet_bind_hashbucket *head, *head2; 940 struct inet_bind2_bucket *tb2, *new_tb2; 941 int l3mdev = inet_sk_bound_l3mdev(sk); 942 int port = inet_sk(sk)->inet_num; 943 struct net *net = sock_net(sk); 944 int bhash; 945 946 if (!inet_csk(sk)->icsk_bind2_hash) { 947 /* Not bind()ed before. */ 948 if (reset) 949 inet_reset_saddr(sk); 950 else 951 inet_update_saddr(sk, saddr, family); 952 953 return 0; 954 } 955 956 /* Allocate a bind2 bucket ahead of time to avoid permanently putting 957 * the bhash2 table in an inconsistent state if a new tb2 bucket 958 * allocation fails. 959 */ 960 new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); 961 if (!new_tb2) { 962 if (reset) { 963 /* The (INADDR_ANY, port) bucket might have already 964 * been freed, then we cannot fixup icsk_bind2_hash, 965 * so we give up and unlink sk from bhash/bhash2 not 966 * to leave inconsistency in bhash2. 967 */ 968 inet_put_port(sk); 969 inet_reset_saddr(sk); 970 } 971 972 return -ENOMEM; 973 } 974 975 bhash = inet_bhashfn(net, port, hinfo->bhash_size); 976 head = &hinfo->bhash[bhash]; 977 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 978 979 /* If we change saddr locklessly, another thread 980 * iterating over bhash might see corrupted address. 981 */ 982 spin_lock_bh(&head->lock); 983 984 spin_lock(&head2->lock); 985 __sk_del_bind_node(sk); 986 inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); 987 spin_unlock(&head2->lock); 988 989 if (reset) 990 inet_reset_saddr(sk); 991 else 992 inet_update_saddr(sk, saddr, family); 993 994 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 995 996 spin_lock(&head2->lock); 997 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 998 if (!tb2) { 999 tb2 = new_tb2; 1000 inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); 1001 if (sk_is_connect_bind(sk)) { 1002 tb2->fastreuse = -1; 1003 tb2->fastreuseport = -1; 1004 } 1005 } 1006 inet_csk(sk)->icsk_bind2_hash = tb2; 1007 sk_add_bind_node(sk, &tb2->owners); 1008 spin_unlock(&head2->lock); 1009 1010 spin_unlock_bh(&head->lock); 1011 1012 if (tb2 != new_tb2) 1013 kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); 1014 1015 return 0; 1016 } 1017 1018 int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) 1019 { 1020 return __inet_bhash2_update_saddr(sk, saddr, family, false); 1021 } 1022 1023 void inet_bhash2_reset_saddr(struct sock *sk) 1024 { 1025 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) 1026 __inet_bhash2_update_saddr(sk, NULL, 0, true); 1027 } 1028 1029 /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm 1030 * Note that we use 32bit integers (vs RFC 'short integers') 1031 * because 2^16 is not a multiple of num_ephemeral and this 1032 * property might be used by clever attacker. 1033 * 1034 * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though 1035 * attacks were since demonstrated, thus we use 65536 by default instead 1036 * to really give more isolation and privacy, at the expense of 256kB 1037 * of kernel memory. 1038 */ 1039 #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) 1040 static u32 *table_perturb; 1041 1042 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 1043 struct sock *sk, u64 port_offset, 1044 u32 hash_port0, 1045 int (*check_established)(struct inet_timewait_death_row *, 1046 struct sock *, __u16, struct inet_timewait_sock **, 1047 bool rcu_lookup, u32 hash)) 1048 { 1049 struct inet_hashinfo *hinfo = death_row->hashinfo; 1050 struct inet_bind_hashbucket *head, *head2; 1051 struct inet_timewait_sock *tw = NULL; 1052 int port = inet_sk(sk)->inet_num; 1053 struct net *net = sock_net(sk); 1054 struct inet_bind2_bucket *tb2; 1055 struct inet_bind_bucket *tb; 1056 int step, scan_step, l3mdev; 1057 u32 index, max_rand_step; 1058 bool tb_created = false; 1059 u32 remaining, offset; 1060 int ret, i, low, high; 1061 bool local_ports; 1062 1063 if (port) { 1064 local_bh_disable(); 1065 ret = check_established(death_row, sk, port, NULL, false, 1066 hash_port0 + port); 1067 local_bh_enable(); 1068 return ret; 1069 } 1070 1071 l3mdev = inet_sk_bound_l3mdev(sk); 1072 1073 local_ports = inet_sk_get_local_port_range(sk, &low, &high); 1074 step = local_ports ? 1 : 2; 1075 scan_step = step; 1076 max_rand_step = READ_ONCE(net->ipv4.sysctl_ip_local_port_step_width); 1077 1078 high++; /* [32768, 60999] -> [32768, 61000[ */ 1079 remaining = high - low; 1080 if (!local_ports && remaining > 1) 1081 remaining &= ~1U; 1082 1083 get_random_sleepable_once(table_perturb, 1084 INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); 1085 index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); 1086 1087 offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); 1088 offset %= remaining; 1089 1090 /* In first pass we try ports of @low parity. 1091 * inet_csk_get_port() does the opposite choice. 1092 */ 1093 if (!local_ports) 1094 offset &= ~1U; 1095 1096 if (max_rand_step && remaining > 1) { 1097 u32 range = remaining / step; 1098 u32 upper_bound; 1099 1100 upper_bound = min(range, max_rand_step); 1101 scan_step = get_random_u32_inclusive(1, upper_bound); 1102 while (gcd(scan_step, range) != 1) { 1103 scan_step++; 1104 /* if both scan_step and range are even gcd won't be 1 */ 1105 if (!(scan_step & 1) && !(range & 1)) 1106 scan_step++; 1107 if (unlikely(scan_step > upper_bound)) { 1108 scan_step = 1; 1109 break; 1110 } 1111 } 1112 scan_step *= step; 1113 } 1114 other_parity_scan: 1115 port = low + offset; 1116 for (i = 0; i < remaining; i += step, port += scan_step) { 1117 if (unlikely(port >= high)) 1118 port -= remaining; 1119 if (inet_is_local_reserved_port(net, port)) 1120 continue; 1121 head = &hinfo->bhash[inet_bhashfn(net, port, 1122 hinfo->bhash_size)]; 1123 rcu_read_lock(); 1124 hlist_for_each_entry_rcu(tb, &head->chain, node) { 1125 if (!inet_bind_bucket_match(tb, net, port, l3mdev)) 1126 continue; 1127 if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { 1128 rcu_read_unlock(); 1129 goto next_port; 1130 } 1131 if (!check_established(death_row, sk, port, &tw, true, 1132 hash_port0 + port)) 1133 break; 1134 rcu_read_unlock(); 1135 goto next_port; 1136 } 1137 rcu_read_unlock(); 1138 1139 spin_lock_bh(&head->lock); 1140 1141 /* Does not bother with rcv_saddr checks, because 1142 * the established check is already unique enough. 1143 */ 1144 inet_bind_bucket_for_each(tb, &head->chain) { 1145 if (inet_bind_bucket_match(tb, net, port, l3mdev)) { 1146 if (tb->fastreuse >= 0 || 1147 tb->fastreuseport >= 0) 1148 goto next_port_unlock; 1149 WARN_ON(hlist_empty(&tb->bhash2)); 1150 if (!check_established(death_row, sk, 1151 port, &tw, false, 1152 hash_port0 + port)) 1153 goto ok; 1154 goto next_port_unlock; 1155 } 1156 } 1157 1158 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 1159 net, head, port, l3mdev); 1160 if (!tb) { 1161 spin_unlock_bh(&head->lock); 1162 return -ENOMEM; 1163 } 1164 tb_created = true; 1165 tb->fastreuse = -1; 1166 tb->fastreuseport = -1; 1167 goto ok; 1168 next_port_unlock: 1169 spin_unlock_bh(&head->lock); 1170 next_port: 1171 cond_resched(); 1172 } 1173 1174 if (!local_ports) { 1175 offset++; 1176 if ((offset & 1) && remaining > 1) 1177 goto other_parity_scan; 1178 } 1179 return -EADDRNOTAVAIL; 1180 1181 ok: 1182 /* Find the corresponding tb2 bucket since we need to 1183 * add the socket to the bhash2 table as well 1184 */ 1185 head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); 1186 spin_lock(&head2->lock); 1187 1188 tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); 1189 if (!tb2) { 1190 tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net, 1191 head2, tb, sk); 1192 if (!tb2) 1193 goto error; 1194 tb2->fastreuse = -1; 1195 tb2->fastreuseport = -1; 1196 } 1197 1198 /* Here we want to add a little bit of randomness to the next source 1199 * port that will be chosen. We use a max() with a random here so that 1200 * on low contention the randomness is maximal and on high contention 1201 * it may be inexistent. 1202 */ 1203 i = max_t(int, i, get_random_u32_below(8) * step); 1204 WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step); 1205 1206 /* Head lock still held and bh's disabled */ 1207 inet_bind_hash(sk, tb, tb2, port); 1208 sk->sk_userlocks |= SOCK_CONNECT_BIND; 1209 1210 if (sk_unhashed(sk)) { 1211 inet_sk(sk)->inet_sport = htons(port); 1212 inet_ehash_nolisten(sk, (struct sock *)tw, NULL); 1213 } 1214 if (tw) 1215 inet_twsk_bind_unhash(tw, hinfo); 1216 1217 spin_unlock(&head2->lock); 1218 spin_unlock(&head->lock); 1219 1220 if (tw) 1221 inet_twsk_deschedule_put(tw); 1222 local_bh_enable(); 1223 return 0; 1224 1225 error: 1226 if (sk_hashed(sk)) { 1227 spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash); 1228 1229 sock_prot_inuse_add(net, sk->sk_prot, -1); 1230 1231 spin_lock(lock); 1232 __sk_nulls_del_node_init_rcu(sk); 1233 spin_unlock(lock); 1234 1235 sk->sk_hash = 0; 1236 inet_sk(sk)->inet_sport = 0; 1237 WRITE_ONCE(inet_sk(sk)->inet_num, 0); 1238 1239 if (tw) 1240 inet_twsk_bind_unhash(tw, hinfo); 1241 } 1242 1243 spin_unlock(&head2->lock); 1244 if (tb_created) 1245 inet_bind_bucket_destroy(tb); 1246 spin_unlock(&head->lock); 1247 1248 if (tw) 1249 inet_twsk_deschedule_put(tw); 1250 1251 local_bh_enable(); 1252 1253 return -ENOMEM; 1254 } 1255 1256 /* 1257 * Bind a port for a connect operation and hash it. 1258 */ 1259 int inet_hash_connect(struct inet_timewait_death_row *death_row, 1260 struct sock *sk) 1261 { 1262 const struct inet_sock *inet = inet_sk(sk); 1263 const struct net *net = sock_net(sk); 1264 u64 port_offset = 0; 1265 u32 hash_port0; 1266 1267 if (!inet_sk(sk)->inet_num) 1268 port_offset = inet_sk_port_offset(sk); 1269 1270 inet_init_ehash_secret(); 1271 1272 hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, 1273 inet->inet_daddr, inet->inet_dport); 1274 1275 return __inet_hash_connect(death_row, sk, port_offset, hash_port0, 1276 __inet_check_established); 1277 } 1278 1279 void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, 1280 unsigned long numentries, int scale, 1281 unsigned long low_limit, 1282 unsigned long high_limit) 1283 { 1284 unsigned int i; 1285 1286 h->lhash2 = alloc_large_system_hash(name, 1287 sizeof(*h->lhash2), 1288 numentries, 1289 scale, 1290 0, 1291 NULL, 1292 &h->lhash2_mask, 1293 low_limit, 1294 high_limit); 1295 1296 for (i = 0; i <= h->lhash2_mask; i++) { 1297 spin_lock_init(&h->lhash2[i].lock); 1298 INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, 1299 i + LISTENING_NULLS_BASE); 1300 } 1301 1302 /* this one is used for source ports of outgoing connections */ 1303 table_perturb = alloc_large_system_hash("Table-perturb", 1304 sizeof(*table_perturb), 1305 INET_TABLE_PERTURB_SIZE, 1306 0, 0, NULL, NULL, 1307 INET_TABLE_PERTURB_SIZE, 1308 INET_TABLE_PERTURB_SIZE); 1309 } 1310 1311 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) 1312 { 1313 unsigned int locksz = sizeof(spinlock_t); 1314 unsigned int i, nblocks = 1; 1315 spinlock_t *ptr = NULL; 1316 1317 if (locksz == 0) 1318 goto set_mask; 1319 1320 /* Allocate 2 cache lines or at least one spinlock per cpu. */ 1321 nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); 1322 1323 /* At least one page per NUMA node. */ 1324 nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); 1325 1326 nblocks = roundup_pow_of_two(nblocks); 1327 1328 /* No more locks than number of hash buckets. */ 1329 nblocks = min(nblocks, hashinfo->ehash_mask + 1); 1330 1331 if (num_online_nodes() > 1) { 1332 /* Use vmalloc() to allow NUMA policy to spread pages 1333 * on all available nodes if desired. 1334 */ 1335 ptr = vmalloc_array(nblocks, locksz); 1336 } 1337 if (!ptr) { 1338 ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); 1339 if (!ptr) 1340 return -ENOMEM; 1341 } 1342 for (i = 0; i < nblocks; i++) 1343 spin_lock_init(&ptr[i]); 1344 hashinfo->ehash_locks = ptr; 1345 set_mask: 1346 hashinfo->ehash_locks_mask = nblocks - 1; 1347 return 0; 1348 } 1349 1350 struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, 1351 unsigned int ehash_entries) 1352 { 1353 struct inet_hashinfo *new_hashinfo; 1354 int i; 1355 1356 new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); 1357 if (!new_hashinfo) 1358 goto err; 1359 1360 new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), 1361 GFP_KERNEL_ACCOUNT); 1362 if (!new_hashinfo->ehash) 1363 goto free_hashinfo; 1364 1365 new_hashinfo->ehash_mask = ehash_entries - 1; 1366 1367 if (inet_ehash_locks_alloc(new_hashinfo)) 1368 goto free_ehash; 1369 1370 for (i = 0; i < ehash_entries; i++) 1371 INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); 1372 1373 new_hashinfo->pernet = true; 1374 1375 return new_hashinfo; 1376 1377 free_ehash: 1378 vfree(new_hashinfo->ehash); 1379 free_hashinfo: 1380 kfree(new_hashinfo); 1381 err: 1382 return NULL; 1383 } 1384 1385 void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) 1386 { 1387 if (!hashinfo->pernet) 1388 return; 1389 1390 inet_ehash_locks_free(hashinfo); 1391 vfree(hashinfo->ehash); 1392 kfree(hashinfo); 1393 } 1394