// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/mutex.h>
#include <linux/rcupdate_wait.h>

#include <net/net_namespace.h>
#include <linux/nsproxy.h>
#include <net/ip.h>
#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#endif
#include <net/route.h>
#include <net/sock.h>
#include <net/genetlink.h>

#include <linux/uaccess.h>

#include <net/ip_vs.h>

MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

/* Lockdep class key shared by the service table locks */
static struct lock_class_key __ipvs_service_key;

/* sysctl variables */

#ifdef CONFIG_IP_VS_DEBUG
/* Current debug verbosity; consumed by the IP_VS_DBG* macros */
static int sysctl_ip_vs_debug_level = 0;

/* Return the current IPVS debug level */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
#endif


/* Protos */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);


#ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way?
 */
static bool __ip_vs_addr_is_local_v6(struct net *net,
				     const struct in6_addr *addr)
{
	struct flowi6 fl6 = {
		.daddr = *addr,
	};
	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
	bool is_local;

	/* A local address is one that routes via a loopback device */
	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);

	dst_release(dst);
	return is_local;
}
#endif

#ifdef CONFIG_SYSCTL
/*
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
 */
static void update_defense_level(struct netns_ipvs *ipvs)
{
	struct sysinfo i;
	int availmem;
	int amemthresh;
	int nomem;
	int to_change = -1;

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
	/* nomem implies amemthresh - availmem >= 1, so the divisions
	 * below cannot hit zero
	 */
	nomem = (availmem < amemthresh);

	local_bh_disable();

	/* drop_entry: states 1/2 auto-toggle with memory pressure,
	 * 0 = never drop, 3 = always drop
	 */
	spin_lock(&ipvs->dropentry_lock);
	switch (ipvs->sysctl_drop_entry) {
	case 0:
		atomic_set(&ipvs->dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
			ipvs->sysctl_drop_entry = 2;
		} else {
			atomic_set(&ipvs->dropentry, 0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
		} else {
			atomic_set(&ipvs->dropentry, 0);
			ipvs->sysctl_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ipvs->dropentry, 1);
		break;
	}
	spin_unlock(&ipvs->dropentry_lock);

	/* drop_packet: drop rate grows as availmem approaches amemthresh */
	spin_lock(&ipvs->droppacket_lock);
	switch (ipvs->sysctl_drop_packet) {
	case 0:
		ipvs->drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate =
			    ipvs->drop_counter;
			ipvs->sysctl_drop_packet = 2;
		} else {
			ipvs->drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate = ipvs->drop_counter;
		} else {
			ipvs->drop_rate = 0;
			ipvs->sysctl_drop_packet = 1;
		}
		break;
	case 3:
		ipvs->drop_rate = ipvs->sysctl_am_droprate;
		break;
	}
	spin_unlock(&ipvs->droppacket_lock);

	/* secure_tcp: switch protocol timeout tables when entering or
	 * leaving the "secure" state (value > 1)
	 */
	spin_lock(&ipvs->securetcp_lock);
	switch (ipvs->sysctl_secure_tcp) {
	case 0:
		if (ipvs->old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
			ipvs->sysctl_secure_tcp = 2;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
			ipvs->sysctl_secure_tcp = 1;
		}
		break;
	case 3:
		if (ipvs->old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(ipvs,
					      ipvs->sysctl_secure_tcp > 1);
	spin_unlock(&ipvs->securetcp_lock);

	local_bh_enable();
}

/* Handler for delayed work for expiring no
 * destination connections
 */
static void expire_nodest_conn_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	ipvs = container_of(work, struct netns_ipvs,
			    expire_nodest_conn_work.work);
	ip_vs_expire_nodest_conn_flush(ipvs);
}

/*
 *	Timer for checking the defense
 */
#define DEFENSE_TIMER_PERIOD	1*HZ

/* Periodic work: refresh defense levels and randomly drop entries
 * while dropentry mode is active; self-rearms every second.
 */
static void defense_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);

	update_defense_level(ipvs);
	if (atomic_read(&ipvs->dropentry))
		ip_vs_random_dropentry(ipvs);
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);
}
#endif

/* Delayed work: stop estimator kthreads on genid (config) change and
 * (re)start missing ones; requeues itself if a start attempt failed.
 */
static void est_reload_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, est_reload_work.work);
	int genid_done = atomic_read(&ipvs->est_genid_done);
	unsigned long delay = HZ / 10;	/* repeat startups after failure */
	bool repeat = false;
	int genid;
	int id;

	mutex_lock(&ipvs->est_mutex);
	genid = atomic_read(&ipvs->est_genid);
	for (id = 0; id < ipvs->est_kt_count; id++) {
		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

		/* netns clean up started, abort delayed work */
		if (!READ_ONCE(ipvs->enable))
			goto unlock;
		if (!kd)
			continue;
		/* New config ? Stop kthread tasks */
		if (genid != genid_done)
			ip_vs_est_kthread_stop(kd);
		if (!kd->task && !ip_vs_est_stopped(ipvs)) {
			/* Do not start kthreads above 0 in calc phase */
			if ((!id || !ipvs->est_calc_phase) &&
			    ip_vs_est_kthread_start(ipvs, kd) < 0)
				repeat = true;
		}
	}

	atomic_set(&ipvs->est_genid_done, genid);

	if (repeat)
		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
				   delay);

unlock:
	mutex_unlock(&ipvs->est_mutex);
}

/* Return the current size of the connection table, 0 if not allocated */
static int get_conn_tab_size(struct netns_ipvs *ipvs)
{
	const struct ip_vs_rht *t;
	int size = 0;

	rcu_read_lock();
	t = rcu_dereference(ipvs->conn_tab);
	if (t)
		size = t->size;
	rcu_read_unlock();

	return size;
}

/* Hold a module reference; pairs with ip_vs_use_count_dec() */
int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}

void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}


/* Service hashing:
 * Operation			Locking order
 * ---------------------------------------------------------------------------
 * add table			service_mutex, svc_resize_sem(W)
 * del table			service_mutex
 * move between tables		svc_resize_sem(W), seqcount_t(W), bit lock
 * add/del service		service_mutex, bit lock
 * find service			RCU, seqcount_t(R)
 * walk services(blocking)	service_mutex, svc_resize_sem(R)
 * walk services(non-blocking)	RCU, seqcount_t(R)
 *
 * - new tables are linked/unlinked under service_mutex and svc_resize_sem
 * - new table is linked on resizing and all operations can run in parallel
 * in 2 tables until the new table is registered as current one
 * - two contexts can modify buckets: config and table resize, both in
 * process context
 * - only table resizer can move entries, so we do not protect t->seqc[]
 * items with t->lock[]
 * - lookups occur under RCU lock and seqcount reader lock to detect if
 * services are moved to new table
 * - move operations may disturb readers: find operation will not miss entries
 * but walkers may see same entry twice if they are forced to retry chains
 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
 * service_mutex to disallow new tables to be installed or to check
 * svc_table_changes and repeat the RCU read section if new table is installed
 */

/*
 *	Returns hash value for virtual service
 */
static inline u32
ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
		  const union nf_inet_addr *addr, __be16 port)
{
	return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
}

/*
 *	Returns hash value of fwmark for virtual service lookup
 */
static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
					__u32 fwmark)
{
	return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
}

/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		pr_err("%s(): request for already hashed, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);

	/* New entries go into recent table */
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	t = rcu_dereference_protected(t->new_tbl, 1);

	if (svc->fwmark == 0) {
		/*
		 *	Hash it by <protocol,addr,port>
		 */
		hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
					 &svc->addr, svc->port);
	} else {
		/*
		 *	Hash it by fwmark
		 */
		hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
	}
	head = t->buckets + (hash & t->mask);
	hlist_bl_lock(head);
	/* hash_key encodes both the table id and the bucket hash */
	WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
	svc->flags |= IP_VS_SVC_F_HASHED;
	hlist_bl_add_head_rcu(&svc->s_list, head);
	hlist_bl_unlock(head);

	return 1;
}


/*
 *	Unhashes a service from svc_table.
 *	Should be called with locked tables.
 */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash_key2;
	u32 hash_key;

	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	t = rcu_dereference_protected(ipvs->svc_table, 1);
	hash_key = READ_ONCE(svc->hash_key);
	/* We need to lock the bucket in the right table */
	if (ip_vs_rht_same_table(t, hash_key)) {
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
		/* Ensure hash_key is read under lock */
		hash_key2 = READ_ONCE(svc->hash_key);
		/* Moved to new table ?
		 */
		if (hash_key != hash_key2) {
			hlist_bl_unlock(head);
			t = rcu_dereference_protected(t->new_tbl, 1);
			head = t->buckets + (hash_key2 & t->mask);
			hlist_bl_lock(head);
		}
	} else {
		/* It is already moved to new table */
		t = rcu_dereference_protected(t->new_tbl, 1);
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
	}
	/* Remove it from svc_table */
	hlist_bl_del_rcu(&svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	atomic_dec(&svc->refcnt);
	hlist_bl_unlock(head);
	return 1;
}


/*
 *	Get service by {netns, proto,addr,port} in the service table.
 */
static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	/* Walk both tables while a resize is in progress */
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for "full" addressed entries */
		hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->af == af &&
				    ip_vs_addr_equal(af, &svc->addr, vaddr) &&
				    svc->port == vport &&
				    svc->protocol == protocol && !svc->fwmark) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}


/*
 *	Get service by {fwmark} in the service table.
 */
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for fwmark addressed entries */
		hash = ip_vs_svc_fwm_hashval(t, af, fwmark);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->fwmark == fwmark && svc->af == af) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}

/* Find service, called under RCU lock */
struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc = NULL;
	int af_id = ip_vs_af_index(af);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
		if (svc)
			goto out;
	}

	/* The per-af counters let us skip lookups that cannot match */
	if (!atomic_read(&ipvs->nonfwm_services[af_id]))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
	if (svc)
		goto out;

	if (protocol == IPPROTO_TCP &&
	    atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
	    (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
		if (svc)
			goto out;
	}

	if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
	}

out:
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");

	return svc;
}

/* Return the number of registered services */
static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
{
	int ns = 0, ni = IP_VS_AF_MAX;

	while (--ni >= 0)
		ns += atomic_read(&ipvs->num_services[ni]);
	return ns;
}

/* Get default load factor to map num_services/u_thresh to t->size */
static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
{
	int factor;

	if (net_eq(ipvs->net, &init_net))
		factor = -3;	/* grow if load is above 12.5% */
	else
		factor = -2;	/* grow if load is above 25% */
	return factor;
}

/* Get the desired svc_table size */
static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
				  int lfactor)
{
	return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
				      lfactor, IP_VS_SVC_TAB_MIN_BITS,
				      IP_VS_SVC_TAB_MAX_BITS);
}

/* Allocate svc_table */
static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
					       int buckets, int lfactor)
{
	struct ip_vs_rht *t;
	int scounts, locks;

	/* No frequent lookups to race with resizing, so use max of 64
	 * seqcounts. Only resizer moves entries, so use 0 locks.
	 */
	scounts = clamp(buckets >> 4, 1, 64);
	locks = 0;

	t = ip_vs_rht_alloc(buckets, scounts, locks);
	if (!t)
		return NULL;
	t->lfactor = lfactor;
	ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
				 IP_VS_SVC_TAB_MAX_BITS);
	return t;
}

/* svc_table resizer work: allocate a new table, migrate all services
 * bucket by bucket under seqcount writer protection, then install the
 * new table and free the old one after an RCU grace period.
 */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	/* Trylocks: on contention just reschedule ourselves (more_work) */
	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table needs to be registered? BUG! */
	if (t != rcu_dereference_protected(t->new_tbl, 1))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ?
	 */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		/* Every 16 steps: check for abort and maybe reschedule */
		if (++limit >= 16) {
			if (!READ_ONCE(ipvs->enable) ||
			    test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/* Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		if (limit >= 64)
			goto same_bucket;
	}

	/* Tables can be switched only under service_mutex */
	while (!mutex_trylock(&ipvs->service_mutex)) {
		cond_resched();
		if (!READ_ONCE(ipvs->enable) ||
		    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
			goto unlock_sem;
	}
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	t_free = t;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent new table to be attached wait here instead of
		 * freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
}

/* Bind dest to svc, taking a reference on svc */
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	rcu_assign_pointer(dest->svc, svc);
}

/* Release svc stats and free the service itself */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	ip_vs_stats_release(&svc->stats);
	kfree(svc);
}

/* RCU callback: free service after grace period */
static void ip_vs_service_rcu_free(struct rcu_head *head)
{
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}

/* Drop a service reference; schedule RCU free when it hits zero */
static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
	if (atomic_dec_and_test(&svc->refcnt)) {
		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
			      svc->fwmark,
			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
			      ntohs(svc->port));
		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
	}
}


/*
 *	Returns hash value for real service
 */
static inline unsigned int ip_vs_rs_hashkey(int af,
					    const union nf_inet_addr *addr,
					    __be16 port)
{
	unsigned int porth = ntohs(port);
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif

	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}

/* Hash ip_vs_dest in rs_table by <proto,addr,port>.
 */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
	unsigned int hash;
	__be16 port;

	if (dest->in_rs_table)
		return;

	/* Pick the port that identifies the real server for the
	 * forwarding method; unsupported methods are not hashed.
	 */
	switch (IP_VS_DFWD_METHOD(dest)) {
	case IP_VS_CONN_F_MASQ:
		port = dest->port;
		break;
	case IP_VS_CONN_F_TUNNEL:
		switch (dest->tun_type) {
		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
			port = dest->tun_port;
			break;
		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
			port = 0;
			break;
		default:
			return;
		}
		break;
	default:
		return;
	}

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);

	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
	dest->in_rs_table = 1;
}

/* Unhash ip_vs_dest from rs_table. */
static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
	/*
	 *	Remove it from the rs_table table.
	 */
	if (dest->in_rs_table) {
		hlist_del_rcu(&dest->d_list);
		dest->in_rs_table = 0;
	}
}

/* Check if real service by <proto,addr,port> is present */
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return true;
		}
	}

	return false;
}

/* Find real service record by <proto,addr,port>.
 * In case of multiple records with the same <proto,addr,port>, only
 * the first found record is returned.
 *
 * To be called under RCU lock.
 */
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
					   __u16 protocol,
					   const union nf_inet_addr *daddr,
					   __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

/* Find real service record by <af,addr,tun_port>.
 * In case of multiple records with the same <af,addr,tun_port>, only
 * the first found record is returned.
 *
 * To be called under RCU lock.
 */
struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
				     const union nf_inet_addr *daddr,
				     __be16 tun_port)
{
	struct ip_vs_dest *dest;
	unsigned int hash;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, tun_port);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		if (dest->tun_port == tun_port &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

/* Lookup destination by {addr,port} in the given service
 * Called under RCU lock.
 */
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
		  const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;

	/*
	 *	Find the destination for the given service
	 */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if ((dest->af == dest_af) &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    (dest->port == dport)) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}

/*
 * Find destination by {daddr,dport,vaddr,protocol}
 * Created to be used in ip_vs_process_message() in
 * the backup synchronization daemon. It finds the
 * destination to be bound to the received connection
 * on the backup.
 * Called under RCU lock, no refcnt is returned.
 */
struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
				   const union nf_inet_addr *daddr,
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
{
	struct ip_vs_dest *dest;
	struct ip_vs_service *svc;
	__be16 port = dport;

	svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
	if (!svc)
		return NULL;
	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
		port = 0;
	dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
	if (!dest)
		/* retry with the other port choice (port ^ dport flips
		 * between 0 and dport)
		 */
		dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
	return dest;
}

/* RCU callback: release cached route and free the dest_dst holder */
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}

/* Release dest_dst and dst_cache for dest in user context */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *old;

	old =
	      rcu_dereference_protected(dest->dest_dst, 1);
	if (old) {
		RCU_INIT_POINTER(dest->dest_dst, NULL);
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
	}
}

/*
 *  Lookup dest by {svc,addr,port} in the destination trash.
 *  The destination trash is used to hold the destinations that are removed
 *  from the service table but are still referenced by some conn entries.
 *  The reason to add the destination trash is when the dest is temporary
 *  down (either by administrator or by monitor program), the dest can be
 *  picked back from the trash, the remaining connections to the dest can
 *  continue, and the counting information of the dest is also useful for
 *  scheduling.
 */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
		     const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;
	struct netns_ipvs *ipvs = svc->ipvs;

	/*
	 *	Find the destination in trash
	 */
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      refcount_read(&dest->refcnt));
		/* On match, unlink from the trash so the caller re-owns it */
		if (dest->af == dest_af &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
		      dest->vport == svc->port))) {
			/* HIT */
			list_del(&dest->t_list);
			goto out;
		}
	}

	dest = NULL;

out:
	spin_unlock_bh(&ipvs->dest_trash_lock);

	return dest;
}

/* RCU callback: release dest stats and drop the final dest reference */
static void ip_vs_dest_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest *dest;

	dest = container_of(head, struct ip_vs_dest,
			    rcu_head);
	ip_vs_stats_release(&dest->stats);
	ip_vs_dest_put_and_free(dest);
}

/* Drop the dest's reference on its service and free dest after RCU GP */
static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);

	__ip_vs_svc_put(svc);
	call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}

/*
 *  Clean up all the destinations in the trash
 *  Called by the ip_vs_control_cleanup()
 *
 *  When the ip_vs_control_clearup is activated by ipvs module exit,
 *  the service tables must have been flushed and all the connections
 *  are expired, and the refcnt of each destination in the trash must
 *  be 1, so we simply release them here.
 */
static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
{
	struct ip_vs_dest *dest, *nxt;

	timer_delete_sync(&ipvs->dest_trash_timer);
	/* No need to use dest_trash_lock */
	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
}

/* RCU callback: release detached stats holder */
static void ip_vs_stats_rcu_free(struct rcu_head *head)
{
	struct ip_vs_stats_rcu *rs = container_of(head,
						  struct ip_vs_stats_rcu,
						  rcu_head);

	ip_vs_stats_release(&rs->s);
	kfree(rs);
}

/* Copy counters (relative to the zero point) and rates to dst */
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c

	spin_lock(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

	ip_vs_read_estimator(dst, src);

	spin_unlock(&src->lock);
}

/* Convert 64-bit kernel stats to the 32-bit user-space layout
 * (byte counters stay 64-bit)
 */
static void
ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
{
	dst->conns = (u32)src->conns;
	dst->inpkts = (u32)src->inpkts;
	dst->outpkts = (u32)src->outpkts;
	dst->inbytes = src->inbytes;
	dst->outbytes = src->outbytes;
	dst->cps = (u32)src->cps;
	dst->inpps = (u32)src->inpps;
	dst->outpps = (u32)src->outpps;
	dst->inbps = (u32)src->inbps;
	dst->outbps = (u32)src->outbps;
}

static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock(&stats->lock);

	/* get current counters as zero point, rates are zeroed */

#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c

	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock(&stats->lock);
}

/* Allocate fields after kzalloc */
int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
{
	int i;

	spin_lock_init(&s->lock);
	s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
	if (!s->cpustats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);

		u64_stats_init(&cs->syncp);
	}
	return 0;
}

/* Allocate and initialize a stats object; NULL on failure */
struct ip_vs_stats *ip_vs_stats_alloc(void)
{
	struct ip_vs_stats *s = kzalloc_obj(*s);

	if (s && ip_vs_stats_init_alloc(s) >= 0)
		return s;
	kfree(s);
	return NULL;
}

/* Release embedded per-cpu counters only (not the stats struct) */
void ip_vs_stats_release(struct ip_vs_stats *stats)
{
	free_percpu(stats->cpustats);
}

/* Release and free a heap-allocated stats object; NULL is a no-op */
void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (stats) {
		ip_vs_stats_release(stats);
		kfree(stats);
	}
}

/*
 *	Update a destination in the given service
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct
/* __ip_vs_update_dest() continues: copies weight/flags/tunnel config from
 * udest, rehashes dest in rs_table when the forwarding method or tunnel
 * parameters change, (re)binds dest->svc (zeroing stats when moving between
 * services), then either links the dest into svc->destinations (add) or
 * resets its dst cache (update), notifying the scheduler in both cases.
 */
ip_vs_service *old_svc; 1256 struct ip_vs_scheduler *sched; 1257 int conn_flags; 1258 1259 /* We cannot modify an address and change the address family */ 1260 BUG_ON(!add && udest->af != dest->af); 1261 1262 if (add && udest->af != svc->af) 1263 ipvs->mixed_address_family_dests++; 1264 1265 /* keep the last_weight with latest non-0 weight */ 1266 if (add || udest->weight != 0) 1267 atomic_set(&dest->last_weight, udest->weight); 1268 1269 /* set the weight and the flags */ 1270 atomic_set(&dest->weight, udest->weight); 1271 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; 1272 conn_flags |= IP_VS_CONN_F_INACTIVE; 1273 1274 /* Need to rehash? */ 1275 if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != 1276 IP_VS_DFWD_METHOD(dest) || 1277 udest->tun_type != dest->tun_type || 1278 udest->tun_port != dest->tun_port) 1279 ip_vs_rs_unhash(dest); 1280 1281 /* set the tunnel info */ 1282 dest->tun_type = udest->tun_type; 1283 dest->tun_port = udest->tun_port; 1284 dest->tun_flags = udest->tun_flags; 1285 1286 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ 1287 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { 1288 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 1289 } else { 1290 /* FTP-NAT requires conntrack for mangling */ 1291 if (svc->port == FTPPORT) 1292 ip_vs_register_conntrack(svc); 1293 } 1294 atomic_set(&dest->conn_flags, conn_flags); 1295 /* Put the real service in rs_table if not present.
*/ 1296 ip_vs_rs_hash(ipvs, dest); 1297 1298 /* bind the service */ 1299 old_svc = rcu_dereference_protected(dest->svc, 1); 1300 if (!old_svc) { 1301 __ip_vs_bind_svc(dest, svc); 1302 } else { 1303 if (old_svc != svc) { 1304 ip_vs_zero_stats(&dest->stats); 1305 __ip_vs_bind_svc(dest, svc); 1306 __ip_vs_svc_put(old_svc); 1307 } 1308 } 1309 1310 /* set the dest status flags */ 1311 dest->flags |= IP_VS_DEST_F_AVAILABLE; 1312 1313 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) 1314 dest->flags &= ~IP_VS_DEST_F_OVERLOAD; 1315 dest->u_threshold = udest->u_threshold; 1316 dest->l_threshold = udest->l_threshold; 1317 1318 dest->af = udest->af; 1319 1320 if (add) { 1321 list_add_rcu(&dest->n_list, &svc->destinations); 1322 svc->num_dests++; 1323 sched = rcu_dereference_protected(svc->scheduler, 1); 1324 if (sched && sched->add_dest) 1325 sched->add_dest(svc, dest); 1326 } else { 1327 spin_lock_bh(&dest->dst_lock); 1328 __ip_vs_dst_cache_reset(dest); 1329 spin_unlock_bh(&dest->dst_lock); 1330 1331 sched = rcu_dereference_protected(svc->scheduler, 1); 1332 if (sched && sched->upd_dest) 1333 sched->upd_dest(svc, dest); 1334 } 1335 } 1336 1337 1338 /* 1339 * Create a destination for the given service 1340 */ 1341 static int 1342 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1343 { 1344 struct ip_vs_dest *dest; 1345 unsigned int atype; 1346 int ret; 1347 1348 #ifdef CONFIG_IP_VS_IPV6 1349 if (udest->af == AF_INET6) { 1350 atype = ipv6_addr_type(&udest->addr.in6); 1351 if ((!(atype & IPV6_ADDR_UNICAST) || 1352 atype & IPV6_ADDR_LINKLOCAL) && 1353 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) 1354 return -EINVAL; 1355 1356 ret = nf_defrag_ipv6_enable(svc->ipvs->net); 1357 if (ret) 1358 return ret; 1359 } else 1360 #endif 1361 { 1362 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); 1363 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 1364 return -EINVAL; 1365 } 1366 1367 dest = kzalloc_obj(struct
/* ip_vs_new_dest() continues: allocate the dest, init its stats and
 * estimator, copy virtual-service identity and real-server address from
 * svc/udest, then link it in via __ip_vs_update_dest(..., add=1).
 * The error path unwinds in reverse order (stats, then the allocation).
 */
ip_vs_dest); 1368 if (dest == NULL) 1369 return -ENOMEM; 1370 1371 ret = ip_vs_stats_init_alloc(&dest->stats); 1372 if (ret < 0) 1373 goto err_alloc; 1374 1375 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1376 if (ret < 0) 1377 goto err_stats; 1378 1379 dest->af = udest->af; 1380 dest->protocol = svc->protocol; 1381 dest->vaddr = svc->addr; 1382 dest->vport = svc->port; 1383 dest->vfwmark = svc->fwmark; 1384 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); 1385 dest->port = udest->port; 1386 1387 atomic_set(&dest->activeconns, 0); 1388 atomic_set(&dest->inactconns, 0); 1389 atomic_set(&dest->persistconns, 0); 1390 refcount_set(&dest->refcnt, 1); 1391 1392 INIT_HLIST_NODE(&dest->d_list); 1393 spin_lock_init(&dest->dst_lock); 1394 __ip_vs_update_dest(svc, dest, udest, 1); 1395 1396 return 0; 1397 1398 err_stats: 1399 ip_vs_stats_release(&dest->stats); 1400 1401 err_alloc: 1402 kfree(dest); 1403 return ret; 1404 } 1405 1406 1407 /* 1408 * Add a destination into an existing service 1409 */ 1410 static int 1411 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1412 { 1413 struct ip_vs_dest *dest; 1414 union nf_inet_addr daddr; 1415 __be16 dport = udest->port; 1416 int ret; 1417 1418 if (udest->weight < 0) { 1419 pr_err("%s(): server weight less than zero\n", __func__); 1420 return -ERANGE; 1421 } 1422 1423 if (udest->l_threshold > udest->u_threshold) { 1424 pr_err("%s(): lower threshold is higher than upper threshold\n", 1425 __func__); 1426 return -ERANGE; 1427 } 1428 1429 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1430 if (udest->tun_port == 0) { 1431 pr_err("%s(): tunnel port is zero\n", __func__); 1432 return -EINVAL; 1433 } 1434 } 1435 1436 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1437 1438 /* We use function that requires RCU lock */ 1439 rcu_read_lock(); 1440 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1441 rcu_read_unlock(); 1442 1443 if (dest != NULL) { 1444 IP_VS_DBG(1, "%s(): dest
already exists\n", __func__); 1445 return -EEXIST; 1446 } 1447 1448 /* 1449 * Check if the dest already exists in the trash and 1450 * is from the same service 1451 */ 1452 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); 1453 1454 if (dest != NULL) { 1455 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " 1456 "dest->refcnt=%d, service %u/%s:%u\n", 1457 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), 1458 refcount_read(&dest->refcnt), 1459 dest->vfwmark, 1460 IP_VS_DBG_ADDR(svc->af, &dest->vaddr), 1461 ntohs(dest->vport)); 1462 1463 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); 1464 if (ret < 0) 1465 return ret; 1466 __ip_vs_update_dest(svc, dest, udest, 1); 1467 } else { 1468 /* 1469 * Allocate and initialize the dest structure 1470 */ 1471 ret = ip_vs_new_dest(svc, udest); 1472 } 1473 1474 return ret; 1475 } 1476 1477 1478 /* 1479 * Edit a destination in the given service 1480 */ 1481 static int 1482 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1483 { 1484 struct ip_vs_dest *dest; 1485 union nf_inet_addr daddr; 1486 __be16 dport = udest->port; 1487 1488 if (udest->weight < 0) { 1489 pr_err("%s(): server weight less than zero\n", __func__); 1490 return -ERANGE; 1491 } 1492 1493 if (udest->l_threshold > udest->u_threshold) { 1494 pr_err("%s(): lower threshold is higher than upper threshold\n", 1495 __func__); 1496 return -ERANGE; 1497 } 1498 1499 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1500 if (udest->tun_port == 0) { 1501 pr_err("%s(): tunnel port is zero\n", __func__); 1502 return -EINVAL; 1503 } 1504 } 1505 1506 ip_vs_addr_copy(udest->af, &daddr, &udest->addr); 1507 1508 /* We use function that requires RCU lock */ 1509 rcu_read_lock(); 1510 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); 1511 rcu_read_unlock(); 1512 1513 if (dest == NULL) { 1514 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); 1515 return -ENOENT; 1516 } 1517 1518 __ip_vs_update_dest(svc, dest, udest, 0);
/* ip_vs_edit_dest() tail: udest was applied to the existing dest above.
 * __ip_vs_del_dest() below stops the estimator, unhashes the dest from
 * rs_table and parks it in ipvs->dest_trash still holding its reference;
 * the trash timer is armed only when the trash transitions from empty and
 * we are not in netns/module cleanup.
 */
1519 1520 return 0; 1521 } 1522 1523 /* 1524 * Delete a destination (must be already unlinked from the service) 1525 */ 1526 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, 1527 bool cleanup) 1528 { 1529 ip_vs_stop_estimator(ipvs, &dest->stats); 1530 1531 /* 1532 * Remove it from the d-linked list with the real services. 1533 */ 1534 ip_vs_rs_unhash(dest); 1535 1536 spin_lock_bh(&ipvs->dest_trash_lock); 1537 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", 1538 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), 1539 refcount_read(&dest->refcnt)); 1540 if (list_empty(&ipvs->dest_trash) && !cleanup) 1541 mod_timer(&ipvs->dest_trash_timer, 1542 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1543 /* dest lives in trash with reference */ 1544 list_add(&dest->t_list, &ipvs->dest_trash); 1545 dest->idle_start = 0; 1546 spin_unlock_bh(&ipvs->dest_trash_lock); 1547 1548 /* Queue up delayed work to expire all no destination connections. 1549 * No-op when CONFIG_SYSCTL is disabled. 1550 */ 1551 if (!cleanup) 1552 ip_vs_enqueue_expire_nodest_conns(ipvs); 1553 } 1554 1555 1556 /* 1557 * Unlink a destination from the given service 1558 */ 1559 static void __ip_vs_unlink_dest(struct ip_vs_service *svc, 1560 struct ip_vs_dest *dest, 1561 int svcupd) 1562 { 1563 dest->flags &= ~IP_VS_DEST_F_AVAILABLE; 1564 1565 spin_lock_bh(&dest->dst_lock); 1566 __ip_vs_dst_cache_reset(dest); 1567 spin_unlock_bh(&dest->dst_lock); 1568 1569 /* 1570 * Remove it from the d-linked destination list.
1571 */ 1572 list_del_rcu(&dest->n_list); 1573 svc->num_dests--; 1574 1575 if (dest->af != svc->af) 1576 svc->ipvs->mixed_address_family_dests--; 1577 1578 if (svcupd) { 1579 struct ip_vs_scheduler *sched; 1580 1581 sched = rcu_dereference_protected(svc->scheduler, 1); 1582 if (sched && sched->del_dest) 1583 sched->del_dest(svc, dest); 1584 } 1585 } 1586 1587 1588 /* 1589 * Delete a destination server in the given service 1590 */ 1591 static int 1592 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) 1593 { 1594 struct ip_vs_dest *dest; 1595 __be16 dport = udest->port; 1596 1597 /* We use function that requires RCU lock */ 1598 rcu_read_lock(); 1599 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); 1600 rcu_read_unlock(); 1601 1602 if (dest == NULL) { 1603 IP_VS_DBG(1, "%s(): destination not found!\n", __func__); 1604 return -ENOENT; 1605 } 1606 1607 /* 1608 * Unlink dest from the service 1609 */ 1610 __ip_vs_unlink_dest(svc, dest, 1); 1611 1612 /* 1613 * Delete the destination 1614 */ 1615 __ip_vs_del_dest(svc->ipvs, dest, false); 1616 1617 return 0; 1618 } 1619 1620 static void ip_vs_dest_trash_expire(struct timer_list *t) 1621 { 1622 struct netns_ipvs *ipvs = timer_container_of(ipvs, t, 1623 dest_trash_timer); 1624 struct ip_vs_dest *dest, *next; 1625 unsigned long now = jiffies; 1626 1627 spin_lock(&ipvs->dest_trash_lock); 1628 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { 1629 if (refcount_read(&dest->refcnt) > 1) 1630 continue; 1631 if (dest->idle_start) { 1632 if (time_before(now, dest->idle_start + 1633 IP_VS_DEST_TRASH_PERIOD)) 1634 continue; 1635 } else { 1636 dest->idle_start = max(1UL, now); 1637 continue; 1638 } 1639 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", 1640 dest->vfwmark, 1641 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1642 ntohs(dest->port)); 1643 list_del(&dest->t_list); 1644 ip_vs_dest_free(dest); 1645 } 1646 if (!list_empty(&ipvs->dest_trash)) 1647
/* ip_vs_dest_trash_expire() tail: re-arm the timer while trash is non-empty.
 * ip_vs_add_service() follows: pins the module use count, resolves the
 * scheduler ("none" means no scheduler) and optional persistence engine,
 * validates the IPv6 netmask/prefix, and lazily allocates the service and
 * connection hash tables when this is the first service.
 */
mod_timer(&ipvs->dest_trash_timer, 1648 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); 1649 spin_unlock(&ipvs->dest_trash_lock); 1650 } 1651 1652 /* 1653 * Add a service into the service hash table 1654 */ 1655 static int 1656 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, 1657 struct ip_vs_service **svc_p) 1658 { 1659 struct ip_vs_scheduler *sched = NULL; 1660 struct ip_vs_rht *tc_new = NULL; 1661 struct ip_vs_rht *t, *t_new = NULL; 1662 int af_id = ip_vs_af_index(u->af); 1663 struct ip_vs_service *svc = NULL; 1664 struct ip_vs_pe *pe = NULL; 1665 int ret_hooks = -1; 1666 int ret = 0; 1667 1668 /* increase the module use count */ 1669 if (!ip_vs_use_count_inc()) 1670 return -ENOPROTOOPT; 1671 1672 /* Lookup the scheduler by 'u->sched_name' */ 1673 if (strcmp(u->sched_name, "none")) { 1674 sched = ip_vs_scheduler_get(u->sched_name); 1675 if (!sched) { 1676 pr_info("Scheduler module ip_vs_%s not found\n", 1677 u->sched_name); 1678 ret = -ENOENT; 1679 goto out_err; 1680 } 1681 } 1682 1683 if (u->pe_name && *u->pe_name) { 1684 pe = ip_vs_pe_getbyname(u->pe_name); 1685 if (pe == NULL) { 1686 pr_info("persistence engine module ip_vs_pe_%s " 1687 "not found\n", u->pe_name); 1688 ret = -ENOENT; 1689 goto out_err; 1690 } 1691 } 1692 1693 #ifdef CONFIG_IP_VS_IPV6 1694 if (u->af == AF_INET6) { 1695 __u32 plen = (__force __u32) u->netmask; 1696 1697 if (plen < 1 || plen > 128) { 1698 ret = -EINVAL; 1699 goto out_err; 1700 } 1701 1702 ret = nf_defrag_ipv6_enable(ipvs->net); 1703 if (ret) 1704 goto out_err; 1705 } 1706 #endif 1707 1708 t = rcu_dereference_protected(ipvs->svc_table, 1); 1709 if (!t) { 1710 int lfactor = sysctl_svc_lfactor(ipvs); 1711 int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor); 1712 1713 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor); 1714 if (!t_new) { 1715 ret = -ENOMEM; 1716 goto out_err; 1717 } 1718 } 1719 1720 if (!rcu_dereference_protected(ipvs->conn_tab, 1)) { 1721 int lfactor = sysctl_conn_lfactor(ipvs);
/* ip_vs_add_service() continues: allocate the connection table if missing,
 * register netfilter hooks for the first service of this address family
 * (ret_hooks tracks that for the error path), then allocate and initialize
 * the new service, bind its scheduler, start its estimator, and publish any
 * freshly built tables with rcu_assign_pointer().
 */
1722 int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor); 1723 1724 tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor); 1725 if (!tc_new) { 1726 ret = -ENOMEM; 1727 goto out_err; 1728 } 1729 } 1730 1731 if (!atomic_read(&ipvs->num_services[af_id])) { 1732 ret = ip_vs_register_hooks(ipvs, u->af); 1733 if (ret < 0) 1734 goto out_err; 1735 ret_hooks = ret; 1736 } 1737 1738 svc = kzalloc_obj(struct ip_vs_service); 1739 if (svc == NULL) { 1740 IP_VS_DBG(1, "%s(): no memory\n", __func__); 1741 ret = -ENOMEM; 1742 goto out_err; 1743 } 1744 ret = ip_vs_stats_init_alloc(&svc->stats); 1745 if (ret < 0) 1746 goto out_err; 1747 1748 /* I'm the first user of the service */ 1749 atomic_set(&svc->refcnt, 0); 1750 1751 svc->af = u->af; 1752 svc->protocol = u->protocol; 1753 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); 1754 svc->port = u->port; 1755 svc->fwmark = u->fwmark; 1756 svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; 1757 svc->timeout = u->timeout * HZ; 1758 svc->netmask = u->netmask; 1759 svc->ipvs = ipvs; 1760 1761 INIT_LIST_HEAD(&svc->destinations); 1762 spin_lock_init(&svc->sched_lock); 1763 1764 /* Bind the scheduler */ 1765 if (sched) { 1766 ret = ip_vs_bind_scheduler(svc, sched); 1767 if (ret) 1768 goto out_err; 1769 } 1770 1771 ret = ip_vs_start_estimator(ipvs, &svc->stats); 1772 if (ret < 0) 1773 goto out_err; 1774 1775 if (t_new) { 1776 clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 1777 rcu_assign_pointer(ipvs->svc_table, t_new); 1778 t_new = NULL; 1779 } 1780 if (tc_new) { 1781 rcu_assign_pointer(ipvs->conn_tab, tc_new); 1782 tc_new = NULL; 1783 } 1784 1785 /* Update the virtual service counters */ 1786 if (svc->port == FTPPORT) 1787 atomic_inc(&ipvs->ftpsvc_counter[af_id]); 1788 else if (!svc->port && !svc->fwmark) 1789 atomic_inc(&ipvs->nullsvc_counter[af_id]); 1790 if (pe && pe->conn_out) 1791 atomic_inc(&ipvs->conn_out_counter[af_id]); 1792 1793 /* Bind the ct retriever */ 1794 RCU_INIT_POINTER(svc->pe, pe); 1795 pe = NULL; 1796 1797 if
/* ip_vs_add_service() continues: bump the fwmark/non-fwmark and total
 * service counters, hash svc into the table, schedule a resize when the
 * count crosses the table's upper threshold, and flip ipvs->enable on the
 * very first service. out_err unwinds tables/hooks/scheduler/pe in reverse.
 */
(svc->fwmark) 1798 atomic_inc(&ipvs->fwm_services[af_id]); 1799 else 1800 atomic_inc(&ipvs->nonfwm_services[af_id]); 1801 atomic_inc(&ipvs->num_services[af_id]); 1802 1803 /* Hash the service into the service table */ 1804 ip_vs_svc_hash(svc); 1805 1806 /* Schedule resize work */ 1807 if (t && ip_vs_get_num_services(ipvs) > t->u_thresh && 1808 !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags)) 1809 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1810 1); 1811 1812 *svc_p = svc; 1813 1814 if (!READ_ONCE(ipvs->enable)) { 1815 /* Now there is a service - full throttle */ 1816 WRITE_ONCE(ipvs->enable, 1); 1817 1818 /* Start estimation for first time */ 1819 ip_vs_est_reload_start(ipvs); 1820 } 1821 1822 return 0; 1823 1824 1825 out_err: 1826 if (tc_new) 1827 ip_vs_rht_free(tc_new); 1828 if (t_new) 1829 ip_vs_rht_free(t_new); 1830 if (ret_hooks >= 0) 1831 ip_vs_unregister_hooks(ipvs, u->af); 1832 if (svc != NULL) { 1833 ip_vs_unbind_scheduler(svc, sched); 1834 ip_vs_service_free(svc); 1835 } 1836 ip_vs_scheduler_put(sched); 1837 ip_vs_pe_put(pe); 1838 1839 /* decrease the module use count */ 1840 ip_vs_use_count_dec(); 1841 1842 return ret; 1843 } 1844 1845 1846 /* 1847 * Edit a service and bind it with a new scheduler 1848 */ 1849 static int 1850 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) 1851 { 1852 struct ip_vs_scheduler *sched = NULL, *old_sched; 1853 struct ip_vs_pe *pe = NULL, *old_pe = NULL; 1854 int ret = 0; 1855 bool new_pe_conn_out, old_pe_conn_out; 1856 struct netns_ipvs *ipvs = svc->ipvs; 1857 int af_id = ip_vs_af_index(svc->af); 1858 1859 /* 1860 * Lookup the scheduler, by 'u->sched_name' 1861 */ 1862 if (strcmp(u->sched_name, "none")) { 1863 sched = ip_vs_scheduler_get(u->sched_name); 1864 if (!sched) { 1865 pr_info("Scheduler module ip_vs_%s not found\n", 1866 u->sched_name); 1867 return -ENOENT; 1868 } 1869 } 1870 old_sched = sched; 1871 1872 if (u->pe_name && *u->pe_name) { 1873 pe =
/* ip_vs_edit_service() continues: resolve the new persistence engine,
 * validate the IPv6 prefix length, then swap schedulers — the old one is
 * unbound and a synchronize_rcu() waits out all svc->sched_data users before
 * the new one is bound. Flags/timeout/netmask are updated, the pe pointer is
 * swapped with rcu_assign_pointer(), and conn_out counters are adjusted when
 * the old/new pe differ in having a conn_out method.
 */
ip_vs_pe_getbyname(u->pe_name); 1874 if (pe == NULL) { 1875 pr_info("persistence engine module ip_vs_pe_%s " 1876 "not found\n", u->pe_name); 1877 ret = -ENOENT; 1878 goto out; 1879 } 1880 old_pe = pe; 1881 } 1882 1883 #ifdef CONFIG_IP_VS_IPV6 1884 if (u->af == AF_INET6) { 1885 __u32 plen = (__force __u32) u->netmask; 1886 1887 if (plen < 1 || plen > 128) { 1888 ret = -EINVAL; 1889 goto out; 1890 } 1891 } 1892 #endif 1893 1894 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1895 if (sched != old_sched) { 1896 if (old_sched) { 1897 ip_vs_unbind_scheduler(svc, old_sched); 1898 RCU_INIT_POINTER(svc->scheduler, NULL); 1899 /* Wait all svc->sched_data users */ 1900 synchronize_rcu(); 1901 } 1902 /* Bind the new scheduler */ 1903 if (sched) { 1904 ret = ip_vs_bind_scheduler(svc, sched); 1905 if (ret) { 1906 ip_vs_scheduler_put(sched); 1907 goto out; 1908 } 1909 } 1910 } 1911 1912 /* 1913 * Set the flags and timeout value 1914 */ 1915 svc->flags = u->flags | IP_VS_SVC_F_HASHED; 1916 svc->timeout = u->timeout * HZ; 1917 svc->netmask = u->netmask; 1918 1919 old_pe = rcu_dereference_protected(svc->pe, 1); 1920 if (pe != old_pe) { 1921 rcu_assign_pointer(svc->pe, pe); 1922 /* check for optional methods in new pe */ 1923 new_pe_conn_out = (pe && pe->conn_out) ? true : false; 1924 old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; 1925 if (new_pe_conn_out && !old_pe_conn_out) 1926 atomic_inc(&ipvs->conn_out_counter[af_id]); 1927 if (old_pe_conn_out && !new_pe_conn_out) 1928 atomic_dec(&ipvs->conn_out_counter[af_id]); 1929 } 1930 1931 out: 1932 ip_vs_scheduler_put(old_sched); 1933 ip_vs_pe_put(old_pe); 1934 return ret; 1935 } 1936 1937 /* 1938 * Delete a service from the service list 1939 * - The service must be unlinked, unlocked and not referenced!
1940 * - We are called under _bh lock 1941 */ 1942 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) 1943 { 1944 struct ip_vs_dest *dest, *nxt; 1945 struct ip_vs_scheduler *old_sched; 1946 struct ip_vs_pe *old_pe; 1947 struct netns_ipvs *ipvs = svc->ipvs; 1948 int af_id = ip_vs_af_index(svc->af); 1949 1950 atomic_dec(&ipvs->num_services[af_id]); 1951 if (!atomic_read(&ipvs->num_services[af_id])) 1952 ip_vs_unregister_hooks(ipvs, svc->af); 1953 if (svc->fwmark) 1954 atomic_dec(&ipvs->fwm_services[af_id]); 1955 else 1956 atomic_dec(&ipvs->nonfwm_services[af_id]); 1957 1958 ip_vs_stop_estimator(svc->ipvs, &svc->stats); 1959 1960 /* Unbind scheduler */ 1961 old_sched = rcu_dereference_protected(svc->scheduler, 1); 1962 ip_vs_unbind_scheduler(svc, old_sched); 1963 ip_vs_scheduler_put(old_sched); 1964 1965 /* Unbind persistence engine, keep svc->pe */ 1966 old_pe = rcu_dereference_protected(svc->pe, 1); 1967 if (old_pe && old_pe->conn_out) 1968 atomic_dec(&ipvs->conn_out_counter[af_id]); 1969 ip_vs_pe_put(old_pe); 1970 1971 /* 1972 * Unlink the whole destination list 1973 */ 1974 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 1975 __ip_vs_unlink_dest(svc, dest, 0); 1976 __ip_vs_del_dest(svc->ipvs, dest, cleanup); 1977 } 1978 1979 /* 1980 * Update the virtual service counters 1981 */ 1982 if (svc->port == FTPPORT) 1983 atomic_dec(&ipvs->ftpsvc_counter[af_id]); 1984 else if (!svc->port && !svc->fwmark) 1985 atomic_dec(&ipvs->nullsvc_counter[af_id]); 1986 1987 /* 1988 * Free the service if nobody refers to it 1989 */ 1990 __ip_vs_svc_put(svc); 1991 1992 /* decrease the module use count */ 1993 ip_vs_use_count_dec(); 1994 } 1995 1996 /* 1997 * Unlink a service from list and try to delete it if its refcnt reached 0 1998 */ 1999 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) 2000 { 2001 ip_vs_unregister_conntrack(svc); 2002 /* Hold svc to avoid double release from dest_trash */ 2003 atomic_inc(&svc->refcnt);
/* ip_vs_unlink_service() continues: unhash svc from the service table, then
 * tear it down. ip_vs_del_service() below additionally retires the whole
 * svc_table chain via call_rcu() once the last service is gone, or kicks
 * the shrink work when the count drops under the lower threshold.
 */
2004 /* 2005 * Unhash it from the service table 2006 */ 2007 ip_vs_svc_unhash(svc); 2008 2009 __ip_vs_del_service(svc, cleanup); 2010 } 2011 2012 /* 2013 * Delete a service from the service list 2014 */ 2015 static int ip_vs_del_service(struct ip_vs_service *svc) 2016 { 2017 struct netns_ipvs *ipvs; 2018 struct ip_vs_rht *t, *p; 2019 int ns; 2020 2021 if (svc == NULL) 2022 return -EEXIST; 2023 ipvs = svc->ipvs; 2024 ip_vs_unlink_service(svc, false); 2025 t = rcu_dereference_protected(ipvs->svc_table, 1); 2026 2027 /* Drop the table if no more services */ 2028 ns = ip_vs_get_num_services(ipvs); 2029 if (!ns) { 2030 /* Stop the resizer and drop the tables */ 2031 set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags); 2032 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2033 if (t) { 2034 rcu_assign_pointer(ipvs->svc_table, NULL); 2035 while (1) { 2036 p = rcu_dereference_protected(t->new_tbl, 1); 2037 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2038 if (p == t) 2039 break; 2040 t = p; 2041 } 2042 } 2043 } else if (ns <= t->l_thresh && 2044 !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, 2045 &ipvs->work_flags)) { 2046 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 2047 1); 2048 } 2049 return 0; 2050 } 2051 2052 2053 /* 2054 * Flush all the virtual services 2055 */ 2056 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) 2057 { 2058 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 2059 struct hlist_bl_head *head; 2060 struct ip_vs_service *svc; 2061 struct hlist_bl_node *ne; 2062 struct hlist_bl_node *e; 2063 struct ip_vs_rht *t, *p; 2064 2065 /* Stop the resizer and drop the tables */ 2066 if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags)) 2067 cancel_delayed_work_sync(&ipvs->svc_resize_work); 2068 /* No resizer, so now we have exclusive write access */ 2069 2070 if (ip_vs_get_num_services(ipvs)) { 2071 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 2072 hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list) 2073 ip_vs_unlink_service(svc,
/* ip_vs_flush() continues: every service has been unlinked; now detach the
 * svc_table pointer and free the whole table chain after an RCU grace
 * period. ip_vs_service_nets_cleanup() applies this per exiting netns under
 * service_mutex; ip_vs_forget_dev() drops a dest's cached dst when it points
 * at the given device.
 */
cleanup); 2074 } 2075 } 2076 2077 /* Unregister the hash table and release it after RCU grace period */ 2078 t = rcu_dereference_protected(ipvs->svc_table, 1); 2079 if (t) { 2080 rcu_assign_pointer(ipvs->svc_table, NULL); 2081 while (1) { 2082 p = rcu_dereference_protected(t->new_tbl, 1); 2083 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); 2084 if (p == t) 2085 break; 2086 t = p; 2087 } 2088 } 2089 return 0; 2090 } 2091 2092 /* 2093 * Delete service by {netns} in the service table. 2094 * Called by __ip_vs_batch_cleanup() 2095 */ 2096 void ip_vs_service_nets_cleanup(struct list_head *net_list) 2097 { 2098 struct netns_ipvs *ipvs; 2099 struct net *net; 2100 2101 /* Check for "full" addressed entries */ 2102 list_for_each_entry(net, net_list, exit_list) { 2103 ipvs = net_ipvs(net); 2104 mutex_lock(&ipvs->service_mutex); 2105 ip_vs_flush(ipvs, true); 2106 mutex_unlock(&ipvs->service_mutex); 2107 } 2108 } 2109 2110 /* Put all references for device (dst_cache) */ 2111 static inline void 2112 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) 2113 { 2114 struct ip_vs_dest_dst *dest_dst; 2115 2116 spin_lock_bh(&dest->dst_lock); 2117 dest_dst = rcu_dereference_protected(dest->dest_dst, 1); 2118 if (dest_dst && dest_dst->dst_cache->dev == dev) { 2119 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", 2120 dev->name, 2121 IP_VS_DBG_ADDR(dest->af, &dest->addr), 2122 ntohs(dest->port), 2123 refcount_read(&dest->refcnt)); 2124 __ip_vs_dst_cache_reset(dest); 2125 } 2126 spin_unlock_bh(&dest->dst_lock); 2127 2128 } 2129 /* Netdev event receiver 2130 * Currently only NETDEV_DOWN is handled to release refs to cached dsts 2131 */ 2132 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, 2133 void *ptr) 2134 { 2135 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2136 struct net *net = dev_net(dev); 2137 struct netns_ipvs *ipvs = net_ipvs(net); 2138 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2139 unsigned int resched_score = 0;
/* ip_vs_dst_event() continues: on NETDEV_DOWN walk every service and dest
 * under RCU, dropping dst-cache entries that reference the device;
 * resched_score throttles cond_resched_rcu(), and the walk restarts from
 * scratch whenever svc_table_changes indicates a new table was installed.
 */
2140 struct hlist_bl_head *head; 2141 struct ip_vs_service *svc; 2142 struct hlist_bl_node *e; 2143 struct ip_vs_dest *dest; 2144 int old_gen, new_gen; 2145 2146 if (event != NETDEV_DOWN || !ipvs) 2147 return NOTIFY_DONE; 2148 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); 2149 2150 old_gen = atomic_read(&ipvs->svc_table_changes); 2151 2152 rcu_read_lock(); 2153 2154 repeat: 2155 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 2156 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2157 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2158 list_for_each_entry_rcu(dest, &svc->destinations, 2159 n_list) { 2160 ip_vs_forget_dev(dest, dev); 2161 resched_score += 10; 2162 } 2163 resched_score++; 2164 } 2165 resched_score++; 2166 if (resched_score >= 100) { 2167 resched_score = 0; 2168 cond_resched_rcu(); 2169 new_gen = atomic_read(&ipvs->svc_table_changes); 2170 /* New table installed ? */ 2171 if (old_gen != new_gen) { 2172 old_gen = new_gen; 2173 goto repeat; 2174 } 2175 } 2176 } 2177 rcu_read_unlock(); 2178 2179 return NOTIFY_DONE; 2180 } 2181 2182 /* 2183 * Zero counters in a service or all services 2184 */ 2185 static int ip_vs_zero_service(struct ip_vs_service *svc) 2186 { 2187 struct ip_vs_dest *dest; 2188 2189 list_for_each_entry(dest, &svc->destinations, n_list) { 2190 ip_vs_zero_stats(&dest->stats); 2191 } 2192 ip_vs_zero_stats(&svc->stats); 2193 return 0; 2194 } 2195 2196 static int ip_vs_zero_all(struct netns_ipvs *ipvs) 2197 { 2198 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU(); 2199 unsigned int resched_score = 0; 2200 struct hlist_bl_head *head; 2201 struct ip_vs_service *svc; 2202 struct hlist_bl_node *e; 2203 2204 rcu_read_lock(); 2205 2206 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) { 2207 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) { 2208 ip_vs_zero_service(svc); 2209 resched_score += 10; 2210 } 2211 resched_score++; 2212 if (resched_score >= 100) { 2213 resched_score = 0; 2214 cond_resched_rcu(); 2215 } 2216 } 2217 2218
/* ip_vs_zero_all() tail: also zero the per-netns aggregate stats. The
 * CONFIG_SYSCTL section then begins: proc_do_defense_mode validates 0..3
 * and re-evaluates the defense level; proc_do_sync_threshold enforces
 * val[0] < val[1] under sync_mutex; proc_do_sync_ports requires a power
 * of two >= 1.
 */
rcu_read_unlock(); 2219 2220 ip_vs_zero_stats(&ipvs->tot_stats->s); 2221 return 0; 2222 } 2223 2224 #ifdef CONFIG_SYSCTL 2225 2226 static int 2227 proc_do_defense_mode(const struct ctl_table *table, int write, 2228 void *buffer, size_t *lenp, loff_t *ppos) 2229 { 2230 struct netns_ipvs *ipvs = table->extra2; 2231 int *valp = table->data; 2232 int val = *valp; 2233 int rc; 2234 2235 struct ctl_table tmp = { 2236 .data = &val, 2237 .maxlen = sizeof(int), 2238 .mode = table->mode, 2239 }; 2240 2241 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2242 if (write && (*valp != val)) { 2243 if (val < 0 || val > 3) { 2244 rc = -EINVAL; 2245 } else { 2246 *valp = val; 2247 update_defense_level(ipvs); 2248 } 2249 } 2250 return rc; 2251 } 2252 2253 static int 2254 proc_do_sync_threshold(const struct ctl_table *table, int write, 2255 void *buffer, size_t *lenp, loff_t *ppos) 2256 { 2257 struct netns_ipvs *ipvs = table->extra2; 2258 int *valp = table->data; 2259 int val[2]; 2260 int rc; 2261 struct ctl_table tmp = { 2262 .data = &val, 2263 .maxlen = table->maxlen, 2264 .mode = table->mode, 2265 }; 2266 2267 mutex_lock(&ipvs->sync_mutex); 2268 memcpy(val, valp, sizeof(val)); 2269 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2270 if (write) { 2271 if (val[0] < 0 || val[1] < 0 || 2272 (val[0] >= val[1] && val[1])) 2273 rc = -EINVAL; 2274 else 2275 memcpy(valp, val, sizeof(val)); 2276 } 2277 mutex_unlock(&ipvs->sync_mutex); 2278 return rc; 2279 } 2280 2281 static int 2282 proc_do_sync_ports(const struct ctl_table *table, int write, 2283 void *buffer, size_t *lenp, loff_t *ppos) 2284 { 2285 int *valp = table->data; 2286 int val = *valp; 2287 int rc; 2288 2289 struct ctl_table tmp = { 2290 .data = &val, 2291 .maxlen = sizeof(int), 2292 .mode = table->mode, 2293 }; 2294 2295 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2296 if (write && (*valp != val)) { 2297 if (val < 1 || !is_power_of_2(val)) 2298 rc = -EINVAL; 2299 else 2300 *valp = val; 2301 } 2302 return rc;
/* Estimator sysctl handlers: ipvs_proc_est_cpumask_set() parses a cpulist,
 * restricts it to the writer's cpus_mask, installs it under est_mutex and
 * restarts estimation; ipvs_proc_est_cpumask_get() prints the effective
 * mask (falling back to the housekeeping kthread mask when none was set).
 */
2303 } 2304 2305 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, 2306 void *buffer) 2307 { 2308 struct netns_ipvs *ipvs = table->extra2; 2309 cpumask_var_t *valp = table->data; 2310 cpumask_var_t newmask; 2311 int ret; 2312 2313 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) 2314 return -ENOMEM; 2315 2316 ret = cpulist_parse(buffer, newmask); 2317 if (ret) 2318 goto out; 2319 2320 mutex_lock(&ipvs->est_mutex); 2321 2322 if (!ipvs->est_cpulist_valid) { 2323 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { 2324 ret = -ENOMEM; 2325 goto unlock; 2326 } 2327 ipvs->est_cpulist_valid = 1; 2328 } 2329 cpumask_and(newmask, newmask, ¤t->cpus_mask); 2330 cpumask_copy(*valp, newmask); 2331 /* est_max_threads may depend on cpulist size */ 2332 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); 2333 ipvs->est_calc_phase = 1; 2334 ip_vs_est_reload_start(ipvs); 2335 2336 unlock: 2337 mutex_unlock(&ipvs->est_mutex); 2338 2339 out: 2340 free_cpumask_var(newmask); 2341 return ret; 2342 } 2343 2344 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, 2345 void *buffer, size_t size) 2346 { 2347 struct netns_ipvs *ipvs = table->extra2; 2348 cpumask_var_t *valp = table->data; 2349 struct cpumask *mask; 2350 int ret; 2351 2352 mutex_lock(&ipvs->est_mutex); 2353 2354 if (ipvs->est_cpulist_valid) 2355 mask = *valp; 2356 else 2357 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); 2358 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); 2359 2360 mutex_unlock(&ipvs->est_mutex); 2361 2362 return ret; 2363 } 2364 2365 static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, 2366 void *buffer, size_t *lenp, loff_t *ppos) 2367 { 2368 int ret; 2369 2370 /* Ignore both read and write(append) if *ppos not 0 */ 2371 if (*ppos || !*lenp) { 2372 *lenp = 0; 2373 return 0; 2374 } 2375 if (write) { 2376 /* proc_sys_call_handler() appends terminator */ 2377 ret = ipvs_proc_est_cpumask_set(table, buffer); 2378 if (ret >= 0)
/* advance *ppos on success so a repeated handler call sees EOF */
*ppos += *lenp; 2380 } else { 2381 /* proc_sys_call_handler() allocates 1 byte for terminator */ 2382 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); 2383 if (ret >= 0) { 2384 *lenp = ret; 2385 *ppos += *lenp; 2386 ret = 0; 2387 } 2388 } 2389 return ret; 2390 } 2391 2392 static int ipvs_proc_est_nice(const struct ctl_table *table, int write, 2393 void *buffer, size_t *lenp, loff_t *ppos) 2394 { 2395 struct netns_ipvs *ipvs = table->extra2; 2396 int *valp = table->data; 2397 int val = *valp; 2398 int ret; 2399 2400 struct ctl_table tmp_table = { 2401 .data = &val, 2402 .maxlen = sizeof(int), 2403 .mode = table->mode, 2404 }; 2405 2406 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2407 if (write && ret >= 0) { 2408 if (val < MIN_NICE || val > MAX_NICE) { 2409 ret = -EINVAL; 2410 } else { 2411 mutex_lock(&ipvs->est_mutex); 2412 if (*valp != val) { 2413 *valp = val; 2414 ip_vs_est_reload_start(ipvs); 2415 } 2416 mutex_unlock(&ipvs->est_mutex); 2417 } 2418 } 2419 return ret; 2420 } 2421 2422 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, 2423 void *buffer, size_t *lenp, loff_t *ppos) 2424 { 2425 struct netns_ipvs *ipvs = table->extra2; 2426 int *valp = table->data; 2427 int val = *valp; 2428 int ret; 2429 2430 struct ctl_table tmp_table = { 2431 .data = &val, 2432 .maxlen = sizeof(int), 2433 .mode = table->mode, 2434 }; 2435 2436 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2437 if (write && ret >= 0) { 2438 mutex_lock(&ipvs->est_mutex); 2439 if (*valp != val) { 2440 *valp = val; 2441 ip_vs_est_reload_start(ipvs); 2442 } 2443 mutex_unlock(&ipvs->est_mutex); 2444 } 2445 return ret; 2446 } 2447 2448 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, 2449 void *buffer, size_t *lenp, loff_t *ppos) 2450 { 2451 struct netns_ipvs *ipvs = table->extra2; 2452 int *valp = table->data; 2453 int val = *valp; 2454 int ret; 2455 2456 struct ctl_table tmp_table = { 2457 .data = &val,
/* conn_lfactor / svc_lfactor handlers below: accept values in [-8, 8] and
 * kick the matching resize work when the corresponding table exists.
 * vs_vars[] begins after them; the table continues past this view.
 */
.maxlen = sizeof(int), 2459 }; 2460 2461 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2462 if (write && ret >= 0) { 2463 if (val < -8 || val > 8) { 2464 ret = -EINVAL; 2465 } else { 2466 *valp = val; 2467 if (rcu_access_pointer(ipvs->conn_tab)) 2468 mod_delayed_work(system_unbound_wq, 2469 &ipvs->conn_resize_work, 0); 2470 } 2471 } 2472 return ret; 2473 } 2474 2475 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, 2476 void *buffer, size_t *lenp, loff_t *ppos) 2477 { 2478 struct netns_ipvs *ipvs = table->extra2; 2479 int *valp = table->data; 2480 int val = *valp; 2481 int ret; 2482 2483 struct ctl_table tmp_table = { 2484 .data = &val, 2485 .maxlen = sizeof(int), 2486 }; 2487 2488 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); 2489 if (write && ret >= 0) { 2490 if (val < -8 || val > 8) { 2491 ret = -EINVAL; 2492 } else { 2493 *valp = val; 2494 if (rcu_access_pointer(ipvs->svc_table)) 2495 mod_delayed_work(system_unbound_wq, 2496 &ipvs->svc_resize_work, 0); 2497 } 2498 } 2499 return ret; 2500 } 2501 2502 /* 2503 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 2504 * Do not change order or insert new entries without 2505 * align with netns init in ip_vs_control_net_init() 2506 */ 2507 2508 static struct ctl_table vs_vars[] = { 2509 { 2510 .procname = "amemthresh", 2511 .maxlen = sizeof(int), 2512 .mode = 0644, 2513 .proc_handler = proc_dointvec, 2514 }, 2515 { 2516 .procname = "am_droprate", 2517 .maxlen = sizeof(int), 2518 .mode = 0644, 2519 .proc_handler = proc_dointvec, 2520 }, 2521 { 2522 .procname = "drop_entry", 2523 .maxlen = sizeof(int), 2524 .mode = 0644, 2525 .proc_handler = proc_do_defense_mode, 2526 }, 2527 { 2528 .procname = "drop_packet", 2529 .maxlen = sizeof(int), 2530 .mode = 0644, 2531 .proc_handler = proc_do_defense_mode, 2532 }, 2533 #ifdef CONFIG_IP_VS_NFCT 2534 { 2535 .procname = "conntrack", 2536 .maxlen = sizeof(int), 2537 .mode = 0644, 2538 .proc_handler = &proc_dointvec, 2539 },
2540 #endif 2541 { 2542 .procname = "secure_tcp", 2543 .maxlen = sizeof(int), 2544 .mode = 0644, 2545 .proc_handler = proc_do_defense_mode, 2546 }, 2547 { 2548 .procname = "snat_reroute", 2549 .maxlen = sizeof(int), 2550 .mode = 0644, 2551 .proc_handler = &proc_dointvec, 2552 }, 2553 { 2554 .procname = "sync_version", 2555 .maxlen = sizeof(int), 2556 .mode = 0644, 2557 .proc_handler = proc_dointvec_minmax, 2558 .extra1 = SYSCTL_ZERO, 2559 .extra2 = SYSCTL_ONE, 2560 }, 2561 { 2562 .procname = "sync_ports", 2563 .maxlen = sizeof(int), 2564 .mode = 0644, 2565 .proc_handler = proc_do_sync_ports, 2566 }, 2567 { 2568 .procname = "sync_persist_mode", 2569 .maxlen = sizeof(int), 2570 .mode = 0644, 2571 .proc_handler = proc_dointvec, 2572 }, 2573 { 2574 .procname = "sync_qlen_max", 2575 .maxlen = sizeof(unsigned long), 2576 .mode = 0644, 2577 .proc_handler = proc_doulongvec_minmax, 2578 }, 2579 { 2580 .procname = "sync_sock_size", 2581 .maxlen = sizeof(int), 2582 .mode = 0644, 2583 .proc_handler = proc_dointvec, 2584 }, 2585 { 2586 .procname = "cache_bypass", 2587 .maxlen = sizeof(int), 2588 .mode = 0644, 2589 .proc_handler = proc_dointvec, 2590 }, 2591 { 2592 .procname = "expire_nodest_conn", 2593 .maxlen = sizeof(int), 2594 .mode = 0644, 2595 .proc_handler = proc_dointvec, 2596 }, 2597 { 2598 .procname = "sloppy_tcp", 2599 .maxlen = sizeof(int), 2600 .mode = 0644, 2601 .proc_handler = proc_dointvec, 2602 }, 2603 { 2604 .procname = "sloppy_sctp", 2605 .maxlen = sizeof(int), 2606 .mode = 0644, 2607 .proc_handler = proc_dointvec, 2608 }, 2609 { 2610 .procname = "expire_quiescent_template", 2611 .maxlen = sizeof(int), 2612 .mode = 0644, 2613 .proc_handler = proc_dointvec, 2614 }, 2615 { 2616 .procname = "sync_threshold", 2617 .maxlen = 2618 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), 2619 .mode = 0644, 2620 .proc_handler = proc_do_sync_threshold, 2621 }, 2622 { 2623 .procname = "sync_refresh_period", 2624 .maxlen = sizeof(int), 2625 .mode = 0644, 2626 
.proc_handler = proc_dointvec_jiffies, 2627 }, 2628 { 2629 .procname = "sync_retries", 2630 .maxlen = sizeof(int), 2631 .mode = 0644, 2632 .proc_handler = proc_dointvec_minmax, 2633 .extra1 = SYSCTL_ZERO, 2634 .extra2 = SYSCTL_THREE, 2635 }, 2636 { 2637 .procname = "nat_icmp_send", 2638 .maxlen = sizeof(int), 2639 .mode = 0644, 2640 .proc_handler = proc_dointvec, 2641 }, 2642 { 2643 .procname = "pmtu_disc", 2644 .maxlen = sizeof(int), 2645 .mode = 0644, 2646 .proc_handler = proc_dointvec, 2647 }, 2648 { 2649 .procname = "backup_only", 2650 .maxlen = sizeof(int), 2651 .mode = 0644, 2652 .proc_handler = proc_dointvec, 2653 }, 2654 { 2655 .procname = "conn_reuse_mode", 2656 .maxlen = sizeof(int), 2657 .mode = 0644, 2658 .proc_handler = proc_dointvec, 2659 }, 2660 { 2661 .procname = "schedule_icmp", 2662 .maxlen = sizeof(int), 2663 .mode = 0644, 2664 .proc_handler = proc_dointvec, 2665 }, 2666 { 2667 .procname = "ignore_tunneled", 2668 .maxlen = sizeof(int), 2669 .mode = 0644, 2670 .proc_handler = proc_dointvec, 2671 }, 2672 { 2673 .procname = "run_estimation", 2674 .maxlen = sizeof(int), 2675 .mode = 0644, 2676 .proc_handler = ipvs_proc_run_estimation, 2677 }, 2678 { 2679 .procname = "est_cpulist", 2680 .maxlen = NR_CPUS, /* unused */ 2681 .mode = 0644, 2682 .proc_handler = ipvs_proc_est_cpulist, 2683 }, 2684 { 2685 .procname = "est_nice", 2686 .maxlen = sizeof(int), 2687 .mode = 0644, 2688 .proc_handler = ipvs_proc_est_nice, 2689 }, 2690 { 2691 .procname = "conn_lfactor", 2692 .maxlen = sizeof(int), 2693 .mode = 0644, 2694 .proc_handler = ipvs_proc_conn_lfactor, 2695 }, 2696 { 2697 .procname = "svc_lfactor", 2698 .maxlen = sizeof(int), 2699 .mode = 0644, 2700 .proc_handler = ipvs_proc_svc_lfactor, 2701 }, 2702 #ifdef CONFIG_IP_VS_DEBUG 2703 { 2704 .procname = "debug_level", 2705 .data = &sysctl_ip_vs_debug_level, 2706 .maxlen = sizeof(int), 2707 .mode = 0644, 2708 .proc_handler = proc_dointvec, 2709 }, 2710 #endif 2711 }; 2712 2713 #endif 2714 2715 #ifdef 
CONFIG_PROC_FS 2716 2717 struct ip_vs_iter { 2718 struct seq_net_private p; /* Do not move this, netns depends upon it*/ 2719 struct ip_vs_rht *t; 2720 u32 bucket; 2721 }; 2722 2723 /* 2724 * Write the contents of the VS rule table to a PROCfs file. 2725 * (It is kept just for backward compatibility) 2726 */ 2727 static inline const char *ip_vs_fwd_name(unsigned int flags) 2728 { 2729 switch (flags & IP_VS_CONN_F_FWD_MASK) { 2730 case IP_VS_CONN_F_LOCALNODE: 2731 return "Local"; 2732 case IP_VS_CONN_F_TUNNEL: 2733 return "Tunnel"; 2734 case IP_VS_CONN_F_DROUTE: 2735 return "Route"; 2736 default: 2737 return "Masq"; 2738 } 2739 } 2740 2741 /* Do not expect consistent view during add, del and move(table resize). 2742 * We may miss entries and even show duplicates. 2743 */ 2744 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 2745 { 2746 struct ip_vs_iter *iter = seq->private; 2747 struct ip_vs_rht *t = iter->t; 2748 struct ip_vs_service *svc; 2749 struct hlist_bl_node *e; 2750 int idx; 2751 2752 if (!t) 2753 return NULL; 2754 for (idx = 0; idx < t->size; idx++) { 2755 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) { 2756 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2757 break; 2758 if (pos-- == 0) { 2759 iter->bucket = idx; 2760 return svc; 2761 } 2762 } 2763 } 2764 return NULL; 2765 } 2766 2767 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) 2768 __acquires(RCU) 2769 { 2770 struct ip_vs_iter *iter = seq->private; 2771 struct net *net = seq_file_net(seq); 2772 struct netns_ipvs *ipvs = net_ipvs(net); 2773 2774 rcu_read_lock(); 2775 iter->t = rcu_dereference(ipvs->svc_table); 2776 return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; 2777 } 2778 2779 2780 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) 2781 { 2782 struct ip_vs_service *svc; 2783 struct ip_vs_iter *iter; 2784 struct hlist_bl_node *e; 2785 struct ip_vs_rht *t; 2786 2787 ++*pos; 2788 if (v == SEQ_START_TOKEN) 2789 return ip_vs_info_array(seq,0); 2790 2791 svc = v; 2792 iter = seq->private; 2793 t = iter->t; 2794 if (!t) 2795 return NULL; 2796 2797 hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) { 2798 /* Our cursor was moved to new table ? */ 2799 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2800 break; 2801 return svc; 2802 } 2803 2804 while (++iter->bucket < t->size) { 2805 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket], 2806 s_list) { 2807 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key))) 2808 break; 2809 return svc; 2810 } 2811 } 2812 return NULL; 2813 } 2814 2815 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) 2816 __releases(RCU) 2817 { 2818 rcu_read_unlock(); 2819 } 2820 2821 2822 static int ip_vs_info_seq_show(struct seq_file *seq, void *v) 2823 { 2824 struct net *net = seq_file_net(seq); 2825 struct netns_ipvs *ipvs = net_ipvs(net); 2826 2827 if (v == SEQ_START_TOKEN) { 2828 seq_printf(seq, 2829 "IP Virtual Server version %d.%d.%d (size=%d)\n", 2830 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 2831 seq_puts(seq, 2832 "Prot LocalAddress:Port Scheduler Flags\n"); 2833 seq_puts(seq, 2834 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); 2835 } else { 2836 const struct ip_vs_service *svc = v; 2837 const struct ip_vs_dest *dest; 2838 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); 2839 char *sched_name = sched ? 
sched->name : "none"; 2840 2841 if (!svc->fwmark) { 2842 #ifdef CONFIG_IP_VS_IPV6 2843 if (svc->af == AF_INET6) 2844 seq_printf(seq, "%s [%pI6]:%04X %s ", 2845 ip_vs_proto_name(svc->protocol), 2846 &svc->addr.in6, 2847 ntohs(svc->port), 2848 sched_name); 2849 else 2850 #endif 2851 seq_printf(seq, "%s %08X:%04X %s %s ", 2852 ip_vs_proto_name(svc->protocol), 2853 ntohl(svc->addr.ip), 2854 ntohs(svc->port), 2855 sched_name, 2856 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2857 } else { 2858 seq_printf(seq, "FWM %08X %s %s", 2859 svc->fwmark, sched_name, 2860 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); 2861 } 2862 2863 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 2864 seq_printf(seq, "persistent %d %08X\n", 2865 svc->timeout, 2866 ntohl(svc->netmask)); 2867 else 2868 seq_putc(seq, '\n'); 2869 2870 list_for_each_entry_rcu(dest, &svc->destinations, n_list) { 2871 #ifdef CONFIG_IP_VS_IPV6 2872 if (dest->af == AF_INET6) 2873 seq_printf(seq, 2874 " -> [%pI6]:%04X" 2875 " %-7s %-6d %-10d %-10d\n", 2876 &dest->addr.in6, 2877 ntohs(dest->port), 2878 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2879 atomic_read(&dest->weight), 2880 atomic_read(&dest->activeconns), 2881 atomic_read(&dest->inactconns)); 2882 else 2883 #endif 2884 seq_printf(seq, 2885 " -> %08X:%04X " 2886 "%-7s %-6d %-10d %-10d\n", 2887 ntohl(dest->addr.ip), 2888 ntohs(dest->port), 2889 ip_vs_fwd_name(atomic_read(&dest->conn_flags)), 2890 atomic_read(&dest->weight), 2891 atomic_read(&dest->activeconns), 2892 atomic_read(&dest->inactconns)); 2893 2894 } 2895 } 2896 return 0; 2897 } 2898 2899 static const struct seq_operations ip_vs_info_seq_ops = { 2900 .start = ip_vs_info_seq_start, 2901 .next = ip_vs_info_seq_next, 2902 .stop = ip_vs_info_seq_stop, 2903 .show = ip_vs_info_seq_show, 2904 }; 2905 2906 static int ip_vs_stats_show(struct seq_file *seq, void *v) 2907 { 2908 struct net *net = seq_file_single_net(seq); 2909 struct ip_vs_kstats show; 2910 2911 /* 01234567 01234567 01234567 0123456701234567 
0123456701234567 */ 2912 seq_puts(seq, 2913 " Total Incoming Outgoing Incoming Outgoing\n"); 2914 seq_puts(seq, 2915 " Conns Packets Packets Bytes Bytes\n"); 2916 2917 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); 2918 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", 2919 (unsigned long long)show.conns, 2920 (unsigned long long)show.inpkts, 2921 (unsigned long long)show.outpkts, 2922 (unsigned long long)show.inbytes, 2923 (unsigned long long)show.outbytes); 2924 2925 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ 2926 seq_puts(seq, 2927 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 2928 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", 2929 (unsigned long long)show.cps, 2930 (unsigned long long)show.inpps, 2931 (unsigned long long)show.outpps, 2932 (unsigned long long)show.inbps, 2933 (unsigned long long)show.outbps); 2934 2935 return 0; 2936 } 2937 2938 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) 2939 { 2940 struct net *net = seq_file_single_net(seq); 2941 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; 2942 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; 2943 struct ip_vs_kstats kstats; 2944 int i; 2945 2946 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2947 seq_puts(seq, 2948 " Total Incoming Outgoing Incoming Outgoing\n"); 2949 seq_puts(seq, 2950 "CPU Conns Packets Packets Bytes Bytes\n"); 2951 2952 for_each_possible_cpu(i) { 2953 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); 2954 unsigned int start; 2955 u64 conns, inpkts, outpkts, inbytes, outbytes; 2956 2957 do { 2958 start = u64_stats_fetch_begin(&u->syncp); 2959 conns = u64_stats_read(&u->cnt.conns); 2960 inpkts = u64_stats_read(&u->cnt.inpkts); 2961 outpkts = u64_stats_read(&u->cnt.outpkts); 2962 inbytes = u64_stats_read(&u->cnt.inbytes); 2963 outbytes = u64_stats_read(&u->cnt.outbytes); 2964 } while (u64_stats_fetch_retry(&u->syncp, start)); 2965 2966 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX 
%16LX\n", 2967 i, (u64)conns, (u64)inpkts, 2968 (u64)outpkts, (u64)inbytes, 2969 (u64)outbytes); 2970 } 2971 2972 ip_vs_copy_stats(&kstats, tot_stats); 2973 2974 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", 2975 (unsigned long long)kstats.conns, 2976 (unsigned long long)kstats.inpkts, 2977 (unsigned long long)kstats.outpkts, 2978 (unsigned long long)kstats.inbytes, 2979 (unsigned long long)kstats.outbytes); 2980 2981 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 2982 seq_puts(seq, 2983 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 2984 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", 2985 kstats.cps, 2986 kstats.inpps, 2987 kstats.outpps, 2988 kstats.inbps, 2989 kstats.outbps); 2990 2991 return 0; 2992 } 2993 2994 static int ip_vs_status_show(struct seq_file *seq, void *v) 2995 { 2996 struct net *net = seq_file_single_net(seq); 2997 struct netns_ipvs *ipvs = net_ipvs(net); 2998 unsigned int resched_score = 0; 2999 struct ip_vs_conn_hnode *hn; 3000 struct hlist_bl_head *head; 3001 struct ip_vs_service *svc; 3002 struct ip_vs_rht *t, *pt; 3003 struct hlist_bl_node *e; 3004 int old_gen, new_gen; 3005 u32 counts[8]; 3006 u32 bucket; 3007 int count; 3008 u32 sum1; 3009 u32 sum; 3010 int i; 3011 3012 rcu_read_lock(); 3013 3014 t = rcu_dereference(ipvs->conn_tab); 3015 3016 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count)); 3017 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n", 3018 t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); 3019 3020 if (!atomic_read(&ipvs->conn_count)) 3021 goto after_conns; 3022 old_gen = atomic_read(&ipvs->conn_tab_changes); 3023 3024 repeat_conn: 3025 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ 3026 memset(counts, 0, sizeof(counts)); 3027 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) { 3028 for (bucket = 0; bucket < t->size; bucket++) { 3029 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3030 3031 count = 0; 3032 resched_score++; 3033 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3034 count = 0; 3035 hlist_bl_for_each_entry_rcu(hn, e, head, node) 3036 count++; 3037 } 3038 resched_score += count; 3039 if (resched_score >= 100) { 3040 resched_score = 0; 3041 cond_resched_rcu(); 3042 new_gen = atomic_read(&ipvs->conn_tab_changes); 3043 /* New table installed ? */ 3044 if (old_gen != new_gen) { 3045 old_gen = new_gen; 3046 goto repeat_conn; 3047 } 3048 } 3049 counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; 3050 } 3051 } 3052 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3053 sum += counts[i]; 3054 sum1 = sum - counts[0]; 3055 seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", 3056 counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); 3057 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3058 if (!counts[i]) 3059 continue; 3060 seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", 3061 i, counts[i], 3062 (unsigned long)counts[i] * 100 / max(sum1, 1U)); 3063 } 3064 3065 after_conns: 3066 t = rcu_dereference(ipvs->svc_table); 3067 3068 count = ip_vs_get_num_services(ipvs); 3069 seq_printf(seq, "Services:\t%d\n", count); 3070 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", 3071 t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); 3072 3073 if (!count) 3074 goto after_svc; 3075 old_gen = atomic_read(&ipvs->svc_table_changes); 3076 3077 repeat_svc: 3078 smp_rmb(); /* ipvs->svc_table and svc_table_changes */ 3079 memset(counts, 0, sizeof(counts)); 3080 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) { 3081 for (bucket = 0; bucket < t->size; bucket++) { 3082 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU(); 3083 3084 count = 0; 3085 resched_score++; 3086 ip_vs_rht_walk_bucket_rcu(t, bucket, head) { 3087 count = 0; 3088 hlist_bl_for_each_entry_rcu(svc, e, head, 3089 s_list) 3090 count++; 3091 } 3092 resched_score += count; 3093 if (resched_score >= 100) { 3094 resched_score = 0; 3095 cond_resched_rcu(); 3096 new_gen = atomic_read(&ipvs->svc_table_changes); 3097 /* New table installed ? */ 3098 if (old_gen != new_gen) { 3099 old_gen = new_gen; 3100 goto repeat_svc; 3101 } 3102 } 3103 counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; 3104 } 3105 } 3106 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) 3107 sum += counts[i]; 3108 sum1 = sum - counts[0]; 3109 seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", 3110 counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); 3111 for (i = 1; i < ARRAY_SIZE(counts); i++) { 3112 if (!counts[i]) 3113 continue; 3114 seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", 3115 i, counts[i], 3116 (unsigned long)counts[i] * 100 / max(sum1, 1U)); 3117 } 3118 3119 after_svc: 3120 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n", 3121 ipvs->est_kt_count, ipvs->est_max_threads); 3122 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max); 3123 seq_printf(seq, "Stats thread ests:\t%d\n", 3124 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR * 3125 IPVS_EST_NTICKS); 3126 3127 rcu_read_unlock(); 3128 return 0; 3129 } 3130 3131 #endif 3132 3133 /* 3134 * Set timeout values for tcp tcpfin udp in the timeout_table. 
3135 */ 3136 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3137 { 3138 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3139 struct ip_vs_proto_data *pd; 3140 #endif 3141 3142 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 3143 u->tcp_timeout, 3144 u->tcp_fin_timeout, 3145 u->udp_timeout); 3146 3147 #ifdef CONFIG_IP_VS_PROTO_TCP 3148 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || 3149 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { 3150 return -EINVAL; 3151 } 3152 #endif 3153 3154 #ifdef CONFIG_IP_VS_PROTO_UDP 3155 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) 3156 return -EINVAL; 3157 #endif 3158 3159 #ifdef CONFIG_IP_VS_PROTO_TCP 3160 if (u->tcp_timeout) { 3161 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3162 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] 3163 = u->tcp_timeout * HZ; 3164 } 3165 3166 if (u->tcp_fin_timeout) { 3167 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3168 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] 3169 = u->tcp_fin_timeout * HZ; 3170 } 3171 #endif 3172 3173 #ifdef CONFIG_IP_VS_PROTO_UDP 3174 if (u->udp_timeout) { 3175 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3176 pd->timeout_table[IP_VS_UDP_S_NORMAL] 3177 = u->udp_timeout * HZ; 3178 } 3179 #endif 3180 return 0; 3181 } 3182 3183 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) 3184 3185 struct ip_vs_svcdest_user { 3186 struct ip_vs_service_user s; 3187 struct ip_vs_dest_user d; 3188 }; 3189 3190 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { 3191 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), 3192 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), 3193 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), 3194 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), 3195 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), 3196 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), 3197 
[CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3198 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), 3199 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), 3200 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), 3201 }; 3202 3203 union ip_vs_set_arglen { 3204 struct ip_vs_service_user field_IP_VS_SO_SET_ADD; 3205 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; 3206 struct ip_vs_service_user field_IP_VS_SO_SET_DEL; 3207 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; 3208 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; 3209 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; 3210 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; 3211 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; 3212 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; 3213 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; 3214 }; 3215 3216 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) 3217 3218 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, 3219 struct ip_vs_service_user *usvc_compat) 3220 { 3221 memset(usvc, 0, sizeof(*usvc)); 3222 3223 usvc->af = AF_INET; 3224 usvc->protocol = usvc_compat->protocol; 3225 usvc->addr.ip = usvc_compat->addr; 3226 usvc->port = usvc_compat->port; 3227 usvc->fwmark = usvc_compat->fwmark; 3228 3229 /* Deep copy of sched_name is not needed here */ 3230 usvc->sched_name = usvc_compat->sched_name; 3231 3232 usvc->flags = usvc_compat->flags; 3233 usvc->timeout = usvc_compat->timeout; 3234 usvc->netmask = usvc_compat->netmask; 3235 } 3236 3237 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, 3238 struct ip_vs_dest_user *udest_compat) 3239 { 3240 memset(udest, 0, sizeof(*udest)); 3241 3242 udest->addr.ip = udest_compat->addr; 3243 udest->port = udest_compat->port; 3244 udest->conn_flags = udest_compat->conn_flags; 3245 udest->weight = udest_compat->weight; 3246 udest->u_threshold = udest_compat->u_threshold; 
3247 udest->l_threshold = udest_compat->l_threshold; 3248 udest->af = AF_INET; 3249 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; 3250 } 3251 3252 static int 3253 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) 3254 { 3255 struct net *net = sock_net(sk); 3256 int ret; 3257 unsigned char arg[MAX_SET_ARGLEN]; 3258 struct ip_vs_service_user *usvc_compat; 3259 struct ip_vs_service_user_kern usvc; 3260 struct ip_vs_service *svc; 3261 struct ip_vs_dest_user *udest_compat; 3262 struct ip_vs_dest_user_kern udest; 3263 struct netns_ipvs *ipvs = net_ipvs(net); 3264 3265 BUILD_BUG_ON(sizeof(arg) > 255); 3266 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3267 return -EPERM; 3268 3269 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) 3270 return -EINVAL; 3271 if (len != set_arglen[CMDID(cmd)]) { 3272 IP_VS_DBG(1, "set_ctl: len %u != %u\n", 3273 len, set_arglen[CMDID(cmd)]); 3274 return -EINVAL; 3275 } 3276 3277 if (copy_from_sockptr(arg, ptr, len) != 0) 3278 return -EFAULT; 3279 3280 /* Handle daemons since they have another lock */ 3281 if (cmd == IP_VS_SO_SET_STARTDAEMON || 3282 cmd == IP_VS_SO_SET_STOPDAEMON) { 3283 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 3284 3285 if (cmd == IP_VS_SO_SET_STARTDAEMON) { 3286 struct ipvs_sync_daemon_cfg cfg; 3287 3288 memset(&cfg, 0, sizeof(cfg)); 3289 ret = -EINVAL; 3290 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, 3291 sizeof(cfg.mcast_ifn)) <= 0) 3292 return ret; 3293 cfg.syncid = dm->syncid; 3294 ret = start_sync_thread(ipvs, &cfg, dm->state); 3295 } else { 3296 ret = stop_sync_thread(ipvs, dm->state); 3297 } 3298 return ret; 3299 } 3300 3301 mutex_lock(&ipvs->service_mutex); 3302 if (cmd == IP_VS_SO_SET_FLUSH) { 3303 /* Flush the virtual service */ 3304 ret = ip_vs_flush(ipvs, false); 3305 goto out_unlock; 3306 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 3307 /* Set timeout values for (tcp tcpfin udp) */ 3308 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); 3309 
goto out_unlock; 3310 } else if (!len) { 3311 /* No more commands with len == 0 below */ 3312 ret = -EINVAL; 3313 goto out_unlock; 3314 } 3315 3316 usvc_compat = (struct ip_vs_service_user *)arg; 3317 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); 3318 3319 /* We only use the new structs internally, so copy userspace compat 3320 * structs to extended internal versions */ 3321 ip_vs_copy_usvc_compat(&usvc, usvc_compat); 3322 ip_vs_copy_udest_compat(&udest, udest_compat); 3323 3324 if (cmd == IP_VS_SO_SET_ZERO) { 3325 /* if no service address is set, zero counters in all */ 3326 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { 3327 ret = ip_vs_zero_all(ipvs); 3328 goto out_unlock; 3329 } 3330 } 3331 3332 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && 3333 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == 3334 IP_VS_SCHEDNAME_MAXLEN) { 3335 ret = -EINVAL; 3336 goto out_unlock; 3337 } 3338 3339 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ 3340 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && 3341 usvc.protocol != IPPROTO_SCTP) { 3342 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", 3343 usvc.protocol, &usvc.addr.ip, 3344 ntohs(usvc.port)); 3345 ret = -EFAULT; 3346 goto out_unlock; 3347 } 3348 3349 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 3350 rcu_read_lock(); 3351 if (usvc.fwmark == 0) 3352 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, 3353 &usvc.addr, usvc.port); 3354 else 3355 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); 3356 rcu_read_unlock(); 3357 3358 if (cmd != IP_VS_SO_SET_ADD 3359 && (svc == NULL || svc->protocol != usvc.protocol)) { 3360 ret = -ESRCH; 3361 goto out_unlock; 3362 } 3363 3364 switch (cmd) { 3365 case IP_VS_SO_SET_ADD: 3366 if (svc != NULL) 3367 ret = -EEXIST; 3368 else 3369 ret = ip_vs_add_service(ipvs, &usvc, &svc); 3370 break; 3371 case IP_VS_SO_SET_EDIT: 3372 ret = ip_vs_edit_service(svc, &usvc); 3373 break; 3374 case 
IP_VS_SO_SET_DEL: 3375 ret = ip_vs_del_service(svc); 3376 if (!ret) 3377 goto out_unlock; 3378 break; 3379 case IP_VS_SO_SET_ZERO: 3380 ret = ip_vs_zero_service(svc); 3381 break; 3382 case IP_VS_SO_SET_ADDDEST: 3383 ret = ip_vs_add_dest(svc, &udest); 3384 break; 3385 case IP_VS_SO_SET_EDITDEST: 3386 ret = ip_vs_edit_dest(svc, &udest); 3387 break; 3388 case IP_VS_SO_SET_DELDEST: 3389 ret = ip_vs_del_dest(svc, &udest); 3390 break; 3391 default: 3392 WARN_ON_ONCE(1); 3393 ret = -EINVAL; 3394 break; 3395 } 3396 3397 out_unlock: 3398 mutex_unlock(&ipvs->service_mutex); 3399 return ret; 3400 } 3401 3402 3403 static void 3404 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) 3405 { 3406 struct ip_vs_scheduler *sched; 3407 struct ip_vs_kstats kstats; 3408 char *sched_name; 3409 3410 sched = rcu_dereference_protected(src->scheduler, 1); 3411 sched_name = sched ? sched->name : "none"; 3412 dst->protocol = src->protocol; 3413 dst->addr = src->addr.ip; 3414 dst->port = src->port; 3415 dst->fwmark = src->fwmark; 3416 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); 3417 dst->flags = src->flags; 3418 dst->timeout = src->timeout / HZ; 3419 dst->netmask = src->netmask; 3420 dst->num_dests = src->num_dests; 3421 ip_vs_copy_stats(&kstats, &src->stats); 3422 ip_vs_export_stats_user(&dst->stats, &kstats); 3423 } 3424 3425 static inline int 3426 __ip_vs_get_service_entries(struct netns_ipvs *ipvs, 3427 const struct ip_vs_get_services *get, 3428 struct ip_vs_get_services __user *uptr) 3429 { 3430 struct ip_vs_service_entry entry; 3431 DECLARE_IP_VS_RHT_WALK_BUCKETS(); 3432 struct hlist_bl_head *head; 3433 struct ip_vs_service *svc; 3434 struct hlist_bl_node *e; 3435 int count = 0; 3436 int ret = 0; 3437 3438 lockdep_assert_held(&ipvs->svc_resize_sem); 3439 /* All service modifications are disabled, go ahead */ 3440 ip_vs_rht_walk_buckets(ipvs->svc_table, head) { 3441 hlist_bl_for_each_entry(svc, e, head, s_list) { 3442 /* Only expose IPv4 
entries to old interface */ 3443 if (svc->af != AF_INET) 3444 continue; 3445 3446 if (count >= get->num_services) 3447 goto out; 3448 memset(&entry, 0, sizeof(entry)); 3449 ip_vs_copy_service(&entry, svc); 3450 if (copy_to_user(&uptr->entrytable[count], 3451 &entry, sizeof(entry))) { 3452 ret = -EFAULT; 3453 goto out; 3454 } 3455 count++; 3456 } 3457 } 3458 3459 out: 3460 return ret; 3461 } 3462 3463 static inline int 3464 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, 3465 struct ip_vs_get_dests __user *uptr) 3466 { 3467 struct ip_vs_service *svc; 3468 union nf_inet_addr addr = { .ip = get->addr }; 3469 int ret = 0; 3470 3471 rcu_read_lock(); 3472 if (get->fwmark) 3473 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); 3474 else 3475 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, 3476 get->port); 3477 rcu_read_unlock(); 3478 3479 if (svc) { 3480 int count = 0; 3481 struct ip_vs_dest *dest; 3482 struct ip_vs_dest_entry entry; 3483 struct ip_vs_kstats kstats; 3484 3485 memset(&entry, 0, sizeof(entry)); 3486 list_for_each_entry(dest, &svc->destinations, n_list) { 3487 if (count >= get->num_dests) 3488 break; 3489 3490 /* Cannot expose heterogeneous members via sockopt 3491 * interface 3492 */ 3493 if (dest->af != svc->af) 3494 continue; 3495 3496 entry.addr = dest->addr.ip; 3497 entry.port = dest->port; 3498 entry.conn_flags = atomic_read(&dest->conn_flags); 3499 entry.weight = atomic_read(&dest->weight); 3500 entry.u_threshold = dest->u_threshold; 3501 entry.l_threshold = dest->l_threshold; 3502 entry.activeconns = atomic_read(&dest->activeconns); 3503 entry.inactconns = atomic_read(&dest->inactconns); 3504 entry.persistconns = atomic_read(&dest->persistconns); 3505 ip_vs_copy_stats(&kstats, &dest->stats); 3506 ip_vs_export_stats_user(&entry.stats, &kstats); 3507 if (copy_to_user(&uptr->entrytable[count], 3508 &entry, sizeof(entry))) { 3509 ret = -EFAULT; 3510 break; 3511 } 3512 count++; 3513 } 3514 } else 
3515 ret = -ESRCH; 3516 return ret; 3517 } 3518 3519 static inline void 3520 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) 3521 { 3522 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) 3523 struct ip_vs_proto_data *pd; 3524 #endif 3525 3526 memset(u, 0, sizeof (*u)); 3527 3528 #ifdef CONFIG_IP_VS_PROTO_TCP 3529 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); 3530 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; 3531 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; 3532 #endif 3533 #ifdef CONFIG_IP_VS_PROTO_UDP 3534 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); 3535 u->udp_timeout = 3536 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; 3537 #endif 3538 } 3539 3540 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { 3541 [CMDID(IP_VS_SO_GET_VERSION)] = 64, 3542 [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), 3543 [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), 3544 [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), 3545 [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), 3546 [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), 3547 [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), 3548 }; 3549 3550 union ip_vs_get_arglen { 3551 char field_IP_VS_SO_GET_VERSION[64]; 3552 struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; 3553 struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; 3554 struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; 3555 struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; 3556 struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; 3557 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; 3558 }; 3559 3560 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) 3561 3562 static int 3563 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) 3564 { 3565 unsigned char arg[MAX_GET_ARGLEN]; 3566 int ret = 0; 3567 unsigned int copylen; 3568 struct net *net = 
sock_net(sk); 3569 struct netns_ipvs *ipvs = net_ipvs(net); 3570 3571 BUG_ON(!net); 3572 BUILD_BUG_ON(sizeof(arg) > 255); 3573 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3574 return -EPERM; 3575 3576 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) 3577 return -EINVAL; 3578 3579 copylen = get_arglen[CMDID(cmd)]; 3580 if (*len < (int) copylen) { 3581 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); 3582 return -EINVAL; 3583 } 3584 3585 if (copy_from_user(arg, user, copylen) != 0) 3586 return -EFAULT; 3587 /* 3588 * Handle daemons first since it has its own locking 3589 */ 3590 if (cmd == IP_VS_SO_GET_DAEMON) { 3591 struct ip_vs_daemon_user d[2]; 3592 3593 memset(&d, 0, sizeof(d)); 3594 mutex_lock(&ipvs->sync_mutex); 3595 if (ipvs->sync_state & IP_VS_STATE_MASTER) { 3596 d[0].state = IP_VS_STATE_MASTER; 3597 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, 3598 sizeof(d[0].mcast_ifn)); 3599 d[0].syncid = ipvs->mcfg.syncid; 3600 } 3601 if (ipvs->sync_state & IP_VS_STATE_BACKUP) { 3602 d[1].state = IP_VS_STATE_BACKUP; 3603 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, 3604 sizeof(d[1].mcast_ifn)); 3605 d[1].syncid = ipvs->bcfg.syncid; 3606 } 3607 if (copy_to_user(user, &d, sizeof(d)) != 0) 3608 ret = -EFAULT; 3609 mutex_unlock(&ipvs->sync_mutex); 3610 return ret; 3611 } 3612 3613 if (cmd == IP_VS_SO_GET_SERVICES) { 3614 struct ip_vs_get_services *get; 3615 size_t size; 3616 3617 get = (struct ip_vs_get_services *)arg; 3618 size = struct_size(get, entrytable, get->num_services); 3619 if (*len != size) { 3620 pr_err("length: %u != %zu\n", *len, size); 3621 return -EINVAL; 3622 } 3623 /* Protect against table resizer moving the entries. 3624 * Try reverse locking, so that we do not hold the mutex 3625 * while waiting for semaphore. 
3626 */ 3627 while (1) { 3628 ret = down_read_killable(&ipvs->svc_resize_sem); 3629 if (ret < 0) 3630 return ret; 3631 if (mutex_trylock(&ipvs->service_mutex)) 3632 break; 3633 up_read(&ipvs->svc_resize_sem); 3634 cond_resched(); 3635 } 3636 ret = __ip_vs_get_service_entries(ipvs, get, user); 3637 up_read(&ipvs->svc_resize_sem); 3638 mutex_unlock(&ipvs->service_mutex); 3639 return ret; 3640 } 3641 3642 mutex_lock(&ipvs->service_mutex); 3643 switch (cmd) { 3644 case IP_VS_SO_GET_VERSION: 3645 { 3646 char buf[64]; 3647 3648 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", 3649 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs)); 3650 if (copy_to_user(user, buf, strlen(buf)+1) != 0) { 3651 ret = -EFAULT; 3652 goto out; 3653 } 3654 *len = strlen(buf)+1; 3655 } 3656 break; 3657 3658 case IP_VS_SO_GET_INFO: 3659 { 3660 struct ip_vs_getinfo info; 3661 3662 info.version = IP_VS_VERSION_CODE; 3663 info.size = get_conn_tab_size(ipvs); 3664 info.num_services = 3665 atomic_read(&ipvs->num_services[IP_VS_AF_INET]); 3666 if (copy_to_user(user, &info, sizeof(info)) != 0) 3667 ret = -EFAULT; 3668 } 3669 break; 3670 3671 case IP_VS_SO_GET_SERVICE: 3672 { 3673 struct ip_vs_service_entry *entry; 3674 struct ip_vs_service *svc; 3675 union nf_inet_addr addr; 3676 3677 entry = (struct ip_vs_service_entry *)arg; 3678 addr.ip = entry->addr; 3679 rcu_read_lock(); 3680 if (entry->fwmark) 3681 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); 3682 else 3683 svc = __ip_vs_service_find(ipvs, AF_INET, 3684 entry->protocol, &addr, 3685 entry->port); 3686 rcu_read_unlock(); 3687 if (svc) { 3688 ip_vs_copy_service(entry, svc); 3689 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 3690 ret = -EFAULT; 3691 } else 3692 ret = -ESRCH; 3693 } 3694 break; 3695 3696 case IP_VS_SO_GET_DESTS: 3697 { 3698 struct ip_vs_get_dests *get; 3699 size_t size; 3700 3701 get = (struct ip_vs_get_dests *)arg; 3702 size = struct_size(get, entrytable, get->num_dests); 3703 if (*len != size) { 
3704 pr_err("length: %u != %zu\n", *len, size); 3705 ret = -EINVAL; 3706 goto out; 3707 } 3708 ret = __ip_vs_get_dest_entries(ipvs, get, user); 3709 } 3710 break; 3711 3712 case IP_VS_SO_GET_TIMEOUT: 3713 { 3714 struct ip_vs_timeout_user t; 3715 3716 __ip_vs_get_timeouts(ipvs, &t); 3717 if (copy_to_user(user, &t, sizeof(t)) != 0) 3718 ret = -EFAULT; 3719 } 3720 break; 3721 3722 default: 3723 ret = -EINVAL; 3724 } 3725 3726 out: 3727 mutex_unlock(&ipvs->service_mutex); 3728 return ret; 3729 } 3730 3731 3732 static struct nf_sockopt_ops ip_vs_sockopts = { 3733 .pf = PF_INET, 3734 .set_optmin = IP_VS_BASE_CTL, 3735 .set_optmax = IP_VS_SO_SET_MAX+1, 3736 .set = do_ip_vs_set_ctl, 3737 .get_optmin = IP_VS_BASE_CTL, 3738 .get_optmax = IP_VS_SO_GET_MAX+1, 3739 .get = do_ip_vs_get_ctl, 3740 .owner = THIS_MODULE, 3741 }; 3742 3743 /* 3744 * Generic Netlink interface 3745 */ 3746 3747 /* IPVS genetlink family */ 3748 static struct genl_family ip_vs_genl_family; 3749 3750 /* Policy used for first-level command attributes */ 3751 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { 3752 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, 3753 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, 3754 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, 3755 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, 3756 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, 3757 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, 3758 }; 3759 3760 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ 3761 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { 3762 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, 3763 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, 3764 .len = IP_VS_IFNAME_MAXLEN - 1 }, 3765 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, 3766 [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, 3767 [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, 3768 [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = 
sizeof(struct in6_addr) }, 3769 [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, 3770 [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, 3771 }; 3772 3773 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ 3774 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { 3775 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, 3776 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, 3777 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, 3778 .len = sizeof(union nf_inet_addr) }, 3779 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, 3780 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, 3781 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, 3782 .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, 3783 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, 3784 .len = IP_VS_PENAME_MAXLEN }, 3785 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, 3786 .len = sizeof(struct ip_vs_flags) }, 3787 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, 3788 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, 3789 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, 3790 }; 3791 3792 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ 3793 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { 3794 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, 3795 .len = sizeof(union nf_inet_addr) }, 3796 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, 3797 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, 3798 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, 3799 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, 3800 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, 3801 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, 3802 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, 3803 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, 3804 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, 3805 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, 3806 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, 3807 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, 3808 
[IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, 3809 }; 3810 3811 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, 3812 struct ip_vs_kstats *kstats) 3813 { 3814 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3815 3816 if (!nl_stats) 3817 return -EMSGSIZE; 3818 3819 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || 3820 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || 3821 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || 3822 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3823 IPVS_STATS_ATTR_PAD) || 3824 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3825 IPVS_STATS_ATTR_PAD) || 3826 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || 3827 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || 3828 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || 3829 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || 3830 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) 3831 goto nla_put_failure; 3832 nla_nest_end(skb, nl_stats); 3833 3834 return 0; 3835 3836 nla_put_failure: 3837 nla_nest_cancel(skb, nl_stats); 3838 return -EMSGSIZE; 3839 } 3840 3841 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, 3842 struct ip_vs_kstats *kstats) 3843 { 3844 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); 3845 3846 if (!nl_stats) 3847 return -EMSGSIZE; 3848 3849 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, 3850 IPVS_STATS_ATTR_PAD) || 3851 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, 3852 IPVS_STATS_ATTR_PAD) || 3853 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, 3854 IPVS_STATS_ATTR_PAD) || 3855 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, 3856 IPVS_STATS_ATTR_PAD) || 3857 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, 3858 IPVS_STATS_ATTR_PAD) || 3859 
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
			      IPVS_STATS_ATTR_PAD))
		goto nla_put_failure;
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}

/* Serialize one virtual service into a nested IPVS_CMD_ATTR_SERVICE
 * attribute.  Uses rcu_dereference() on scheduler/pe, so the caller is
 * expected to be inside an RCU read-side section (dump path holds it).
 */
static int ip_vs_genl_fill_service(struct sk_buff *skb,
				   struct ip_vs_service *svc)
{
	struct ip_vs_scheduler *sched;
	struct ip_vs_pe *pe;
	struct nlattr *nl_service;
	struct ip_vs_flags flags = { .flags = svc->flags,
				     .mask = ~0 };
	struct ip_vs_kstats kstats;
	char *sched_name;

	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
	if (!nl_service)
		return -EMSGSIZE;

	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
		goto nla_put_failure;
	/* A service is identified either by fwmark or by proto/addr/port */
	if (svc->fwmark) {
		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
			goto nla_put_failure;
	} else {
		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
			goto nla_put_failure;
	}

	sched = rcu_dereference(svc->scheduler);
	sched_name = sched ? sched->name : "none";
	pe = rcu_dereference(svc->pe);
	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
		goto nla_put_failure;
	/* Stats are exported twice: truncated legacy layout and 64-bit */
	ip_vs_copy_stats(&kstats, &svc->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_service);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_service);
	return -EMSGSIZE;
}

/* Emit one NLM_F_MULTI dump message carrying a single service. */
static int ip_vs_genl_dump_service(struct sk_buff *skb,
				   struct ip_vs_service *svc,
				   struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_SERVICE);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_service(skb, svc) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

/* Netlink dump callback: walk the service table under svc_resize_sem
 * (blocks concurrent table resize) + RCU, emitting one message per
 * service.  cb->args[0] holds the resume index across dump rounds.
 */
static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	int start = cb->args[0];
	int idx = 0;

	down_read(&ipvs->svc_resize_sem);
	rcu_read_lock();
	ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			if (++idx <= start)
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				/* skb full: back up so this entry is retried */
				idx--;
				goto nla_put_failure;
			}
		}
	}

nla_put_failure:
	rcu_read_unlock();
	up_read(&ipvs->svc_resize_sem);
	cb->args[0] = idx;

	return skb->len;
}

/* True if af is a family this build can serve (IPv6 also requires the
 * ipv6 module to be loaded).
 */
static bool ip_vs_is_af_valid(int af)
{
	if (af == AF_INET)
		return true;
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6 && ipv6_mod_enabled())
		return true;
#endif
	return false;
}

/* Parse a nested IPVS_CMD_ATTR_SERVICE attribute into *usvc and look up
 * any existing matching service (*ret_svc, NULL if none).  When
 * full_entry is set, also require/parse the scheduler, flags, timeout
 * and netmask attributes needed to add/edit a service.
 * Returns 0 or a negative errno.
 */
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	/* Identity is af + (fwmark | proto/addr/port) */
	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}

/* Look up the service described by a nested service attribute.
 * Returns the service, NULL if not found, or ERR_PTR on parse error.
 */
static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
						     struct nlattr *nla)
{
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	int ret;

	ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
	return ret ? ERR_PTR(ret) : svc;
}

/* Serialize one real server into a nested IPVS_CMD_ATTR_DEST attribute. */
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;
	struct ip_vs_kstats kstats;

	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
			(atomic_read(&dest->conn_flags) &
			 IP_VS_CONN_F_FWD_MASK)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
			atomic_read(&dest->weight)) ||
	    nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
		       dest->tun_type) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
			 dest->tun_port) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
			dest->tun_flags) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
			atomic_read(&dest->activeconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
			atomic_read(&dest->inactconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
			atomic_read(&dest->persistconns)) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
		goto nla_put_failure;
	/* Export both the legacy 32-bit and the 64-bit stats layouts */
	ip_vs_copy_stats(&kstats, &dest->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}

/* Emit one NLM_F_MULTI dump message carrying a single destination. */
static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

/* Netlink dump callback: list all destinations of the service named in
 * the request, under RCU.  cb->args[0] holds the resume index.
 */
static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
		goto out_err;


	svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
	if (IS_ERR_OR_NULL(svc))
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			/* skb full: back up so this entry is retried */
			idx--;
			goto nla_put_failure;
		}
	}

nla_put_failure:
	cb->args[0] = idx;

out_err:
	rcu_read_unlock();

	return skb->len;
}

/* Parse a nested IPVS_CMD_ATTR_DEST attribute into *udest.  addr+port
 * are always mandatory; with full_entry set, forwarding method, weight
 * and thresholds are also required (tunnel attributes stay optional).
 * udest->af defaults to 0, meaning "inherit the service family" - the
 * caller (ip_vs_genl_set_cmd) resolves that.  Returns 0 or -EINVAL.
 */
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;
	struct nlattr *nla_addr_family;

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
		return -EINVAL;

	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
	nla_addr_family	= attrs[IPVS_DEST_ATTR_ADDR_FAMILY];

	if (!(nla_addr && nla_port))
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
	udest->port = nla_get_be16(nla_port);

	udest->af = nla_get_u16_default(nla_addr_family, 0);

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
			      *nla_tun_flags;

		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
		nla_tun_type	= attrs[IPVS_DEST_ATTR_TUN_TYPE];
		nla_tun_port	= attrs[IPVS_DEST_ATTR_TUN_PORT];
		nla_tun_flags	= attrs[IPVS_DEST_ATTR_TUN_FLAGS];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);

		if (nla_tun_type)
			udest->tun_type = nla_get_u8(nla_tun_type);

		if (nla_tun_port)
			udest->tun_port = nla_get_be16(nla_tun_port);

		if (nla_tun_flags)
			udest->tun_flags = nla_get_u16(nla_tun_flags);
	}

	return 0;
}

/* Serialize one sync-daemon config (master or backup) into a nested
 * IPVS_CMD_ATTR_DAEMON attribute.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nl_daemon;

	nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nl_daemon)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto nla_put_failure;
	/* Multicast group attribute depends on the configured family */
#ifdef CONFIG_IP_VS_IPV6
	if (c->mcast_af == AF_INET6) {
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto nla_put_failure;
	} else
#endif
		if (c->mcast_af == AF_INET &&
		    nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto nla_put_failure;
	nla_nest_end(skb, nl_daemon);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_daemon);
	return -EMSGSIZE;
}

/* Emit one NLM_F_MULTI dump message carrying a single daemon config. */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c,
				  struct netlink_callback *cb)
{
	void *hdr;
	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DAEMON);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, c))
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}

/* Netlink dump callback for the sync daemons: at most one master and
 * one backup entry; cb->args[0]/[1] mark which were already sent.
 */
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	mutex_lock(&ipvs->sync_mutex);
	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
					   &ipvs->mcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[0] = 1;
	}

	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
					   &ipvs->bcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[1] = 1;
	}

nla_put_failure:
	mutex_unlock(&ipvs->sync_mutex);

	return skb->len;
}

/* Parse a daemon attribute set and start a sync thread (master or
 * backup, per IPVS_DAEMON_ATTR_STATE).  State, interface name and sync
 * id are mandatory; multicast group/port/ttl and maxlen are optional.
 * Returns 0 or a negative errno.
 */
static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ipvs_sync_daemon_cfg c;
	struct nlattr *a;
	int ret;

	memset(&c, 0, sizeof(c));
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;
	strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
		sizeof(c.mcast_ifn));
	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);

	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
	if (a)
		c.sync_maxlen = nla_get_u16(a);

	/* Optional multicast group: v4 takes precedence, else v6 */
	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
	if (a) {
		c.mcast_af = AF_INET;
		c.mcast_group.ip = nla_get_in_addr(a);
		if (!ipv4_is_multicast(c.mcast_group.ip))
			return -EINVAL;
	} else {
		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
		if (a) {
#ifdef CONFIG_IP_VS_IPV6
			int addr_type;

			c.mcast_af = AF_INET6;
			c.mcast_group.in6 = nla_get_in6_addr(a);
			addr_type = ipv6_addr_type(&c.mcast_group.in6);
			if (!(addr_type & IPV6_ADDR_MULTICAST))
				return -EINVAL;
#else
			return -EAFNOSUPPORT;
#endif
		}
	}

	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
	if (a)
		c.mcast_port = nla_get_u16(a);

	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
	if (a)
		c.mcast_ttl = nla_get_u8(a);

	/* The synchronization protocol is incompatible with mixed family
	 * services
	 */
	if (ipvs->mixed_address_family_dests > 0)
		return -EINVAL;

	ret = start_sync_thread(ipvs, &c,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

/* Stop the sync thread selected by IPVS_DAEMON_ATTR_STATE. */
static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	int ret;

	if (!attrs[IPVS_DAEMON_ATTR_STATE])
		return -EINVAL;

	ret = stop_sync_thread(ipvs,
			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}

/* Apply IPVS_CMD_SET_CONFIG: update only the timeouts present in the
 * request, keeping current values for the rest.
 */
static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ip_vs_timeout_user t;

	/* Start from the current timeouts so absent attrs are preserved */
	__ip_vs_get_timeouts(ipvs, &t);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
		t.tcp_fin_timeout =
			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);

	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);

	return ip_vs_set_timeout(ipvs, &t);
}

/* genetlink doit handler for IPVS_CMD_NEW_DAEMON/IPVS_CMD_DEL_DAEMON.
 * Kept separate from ip_vs_genl_set_cmd because the sync daemons use
 * their own locking (sync_mutex inside start/stop_sync_thread), not
 * service_mutex.
 */
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
	int ret = -EINVAL, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
			goto out;

		if (cmd == IPVS_CMD_NEW_DAEMON)
			ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
		else
			ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
	}

out:
	return ret;
}

/* genetlink doit handler for all mutating service/dest commands
 * (flush, set-config, zero, new/set/del service, new/set/del dest).
 * Everything runs under service_mutex.  Returns 0 or a negative errno.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	bool need_full_svc = false, need_full_dest = false;
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	mutex_lock(&ipvs->service_mutex);

	/* Commands that need no service argument are handled first */
	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush(ipvs, false);
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		ret = ip_vs_zero_all(ipvs);
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = true;

	/* We use function that requires RCU lock (hlist_bl) */
	rcu_read_lock();
	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc, &svc);
	rcu_read_unlock();
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = true;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto out;
		}

		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
	}

	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}

/* genetlink doit handler for the non-dump GET commands: build a single
 * reply message for GET_SERVICE, GET_CONFIG or GET_INFO.
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	rcu_read_lock();

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			/* no matching service */
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		/* report the per-protocol timeouts that are compiled in */
		__ip_vs_get_timeouts(ipvs, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				get_conn_tab_size(ipvs)))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_reply(msg, info);
	goto out;

nla_put_failure:
	pr_err("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	rcu_read_unlock();

	return ret;
}


/* Generic Netlink operations table.  SET-style commands route to
 * ip_vs_genl_set_cmd / ip_vs_genl_set_daemon, GET-style commands to
 * ip_vs_genl_get_cmd or the dump callbacks.  All ops require
 * GENL_ADMIN_PERM (CAP_NET_ADMIN).
 */
static const struct genl_small_ops ip_vs_genl_ops[] = {
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};

/* IPVS Generic Netlink family definition */
static struct genl_family ip_vs_genl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_ATTR_MAX,
	.policy		= ip_vs_cmd_policy,
	.netnsok	= true,		/* Make ipvsadm work on netns */
	.module		= THIS_MODULE,
	.small_ops	= ip_vs_genl_ops,
	.n_small_ops	= ARRAY_SIZE(ip_vs_genl_ops),
	.resv_start_op	= IPVS_CMD_FLUSH + 1,
	.parallel_ops	= 1,
};

static int __init ip_vs_genl_register(void)
{
	return genl_register_family(&ip_vs_genl_family);
}

static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}

/* End of Generic Netlink interface definitions */

/*
 * Per-netns init/exit functions.
 */
#ifdef CONFIG_SYSCTL
/* Per-netns setup of the net/ipv4/vs sysctl table: initialize the
 * defense/drop locks and work items, set sysctl defaults, point each
 * ctl_table entry at its per-netns variable (the idx++ sequence below
 * must match the layout of vs_vars exactly), register the table, start
 * the estimator and schedule the periodic defense work.
 * Returns 0 on success, negative errno on failure.
 */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	struct ctl_table *tbl;
	int idx, ret;
	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
	/* netns owned by a non-initial user namespace: selected tunables
	 * are exposed read-only (mode 0444) below
	 */
	bool unpriv = net->user_ns != &init_user_ns;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);
	ipvs->est_stopped = 0;

	/* non-init netns gets its own copy of the table; init_net uses
	 * the global vs_vars directly
	 */
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (tbl == NULL)
			return -ENOMEM;
	} else
		tbl = vs_vars;
	/* Initialize sysctl defaults */
	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
		if (tbl[idx].proc_handler == proc_do_defense_mode)
			tbl[idx].extra2 = ipvs;
	}
	/* Wire each entry to its per-netns variable, in vs_vars order */
	idx = 0;
	ipvs->sysctl_amemthresh = 1024;
	tbl[idx++].data = &ipvs->sysctl_amemthresh;
	ipvs->sysctl_am_droprate = 10;
	tbl[idx++].data = &ipvs->sysctl_am_droprate;
	tbl[idx++].data = &ipvs->sysctl_drop_entry;
	tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
	ipvs->sysctl_snat_reroute = 1;
	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
	ipvs->sysctl_sync_ver = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ver;
	ipvs->sysctl_sync_ports = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ports;
	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;

	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;

	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	tbl[idx].data = &ipvs->sysctl_sync_threshold;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[idx++].data = &ipvs->sysctl_sync_retries;
	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
	ipvs->sysctl_pmtu_disc = 1;
	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
	tbl[idx++].data = &ipvs->sysctl_backup_only;
	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;

	ipvs->sysctl_run_estimation = 1;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;

	ipvs->est_cpulist_valid = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_cpulist;

	ipvs->sysctl_est_nice = IPVS_EST_NICE;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_nice;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_lfactor;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_svc_lfactor;

#ifdef CONFIG_IP_VS_DEBUG
	/* Global sysctls must be ro in non-init netns */
	if (!net_eq(net, &init_net))
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
						  ctl_table_size);
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	return 0;

err:
	/* NOTE(review): also reached when register_net_sysctl_sz() itself
	 * failed and sysctl_hdr is NULL -- presumably the unregister helper
	 * tolerates a NULL header; confirm against its implementation.
	 */
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}

/* Per-netns teardown of everything set up by
 * ip_vs_control_net_init_sysctl(): cancel the deferred work, drop the
 * sysctl table, stop the estimator and free per-netns copies.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	/* NOTE(review): the cancel_delayed_work_sync() above should already
	 * cover the embedded work item; this extra cancel looks redundant
	 * but is kept as-is.
	 */
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);

	if (ipvs->est_cpulist_valid)
		free_cpumask_var(ipvs->sysctl_est_cpulist);

	/* only non-init netns allocated its own table copy */
	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}

#else

/* No-op stubs when sysctl support is compiled out */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }

#endif

static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
#ifdef CONFIG_IP_VS_IPV6
	.priority = ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
};

/* Per-netns initialization of the IPVS control interface: locks,
 * tables, counters, stats and proc/sysctl entries.
 * Returns 0 on success, negative errno on failure.
 */
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int idx;

	/* Initialize service_mutex, svc_table per netns */
	__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
	init_rwsem(&ipvs->svc_resize_sem);
	INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
	atomic_set(&ipvs->svc_table_changes, 0);
	RCU_INIT_POINTER(ipvs->svc_table, NULL);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	/* zero all per-address-family service/dest counters */
	for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
		atomic_set(&ipvs->num_services[idx], 0);
		atomic_set(&ipvs->fwm_services[idx], 0);
		atomic_set(&ipvs->nonfwm_services[idx], 0);
		atomic_set(&ipvs->ftpsvc_counter[idx], 0);
		atomic_set(&ipvs->nullsvc_counter[idx], 0);
		atomic_set(&ipvs->conn_out_counter[idx], 0);
	}

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
	ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);

	/* procfs stats */
	/* per-netns totals; freed via call_rcu() in
	 * ip_vs_control_net_cleanup()
	 */
	ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
	if (!ipvs->tot_stats)
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
	if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
				    ip_vs_status_show, NULL))
		goto err_status;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;

	/* unwind in reverse order of creation */
err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);

err_status:
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	ip_vs_stats_release(&ipvs->tot_stats->s);

err_tot_stats:
	kfree(ipvs->tot_stats);

out:
	return ret;
}

/* Per-netns teardown: reverse of ip_vs_control_net_init() */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	/* tot_stats may still be referenced by RCU readers */
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}

/* Register the userspace-facing interfaces: the legacy setsockopt
 * interface and the Generic Netlink family.  On any failure, anything
 * already registered is rolled back.
 */
int __init ip_vs_register_nl_ioctl(void)
{
	int ret;

	ret = nf_register_sockopt(&ip_vs_sockopts);
	if (ret) {
		pr_err("cannot register sockopt.\n");
		goto err_sock;
	}

	ret = ip_vs_genl_register();
	if (ret) {
		pr_err("cannot register Generic Netlink interface.\n");
		goto err_genl;
	}
	return 0;

err_genl:
	nf_unregister_sockopt(&ip_vs_sockopts);
err_sock:
	return ret;
}

void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}

/* Global (module-wide) init: hook the netdevice notifier used to react
 * to device events.
 */
int __init ip_vs_control_init(void)
{
	int ret;

	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
	if (ret < 0)
		return ret;

	return 0;
}


void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}