xref: /linux/net/netfilter/ipvs/ip_vs_ctl.c (revision 91a4855d6c03e770e42f17c798a36a3c46e63de2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * IPVS         An implementation of the IP virtual server support for the
4  *              LINUX operating system.  IPVS is now implemented as a module
5  *              over the NetFilter framework. IPVS can be used to build a
6  *              high-performance and highly available server based on a
7  *              cluster of servers.
8  *
9  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
10  *              Peter Kese <peter.kese@ijs.si>
11  *              Julian Anastasov <ja@ssi.bg>
12  *
13  * Changes:
14  */
15 
16 #define pr_fmt(fmt) "IPVS: " fmt
17 
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/types.h>
21 #include <linux/capability.h>
22 #include <linux/fs.h>
23 #include <linux/sysctl.h>
24 #include <linux/proc_fs.h>
25 #include <linux/workqueue.h>
26 #include <linux/seq_file.h>
27 #include <linux/slab.h>
28 
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/mutex.h>
32 #include <linux/rcupdate_wait.h>
33 
34 #include <net/net_namespace.h>
35 #include <linux/nsproxy.h>
36 #include <net/ip.h>
37 #ifdef CONFIG_IP_VS_IPV6
38 #include <net/ipv6.h>
39 #include <net/ip6_route.h>
40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
41 #endif
42 #include <net/route.h>
43 #include <net/sock.h>
44 #include <net/genetlink.h>
45 
46 #include <linux/uaccess.h>
47 
48 #include <net/ip_vs.h>
49 
50 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
51 
52 static struct lock_class_key __ipvs_service_key;
53 
54 /* sysctl variables */
55 
56 #ifdef CONFIG_IP_VS_DEBUG
57 static int sysctl_ip_vs_debug_level = 0;
58 
/* Return the current debug level, as set via the debug_level sysctl. */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
63 #endif
64 
65 
66 /*  Protos */
67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
68 
69 
70 #ifdef CONFIG_IP_VS_IPV6
/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
/* Return true if @addr is an IPv6 address configured locally in @net. */
static bool __ip_vs_addr_is_local_v6(struct net *net,
				     const struct in6_addr *addr)
{
	struct flowi6 fl6 = {
		.daddr = *addr,
	};
	struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
	bool is_local;

	/* Routes to local addresses resolve to the loopback device */
	is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);

	/* ip6_route_output() always returns a dst; drop our reference */
	dst_release(dst);
	return is_local;
}
86 #endif
87 
88 #ifdef CONFIG_SYSCTL
/*
 *	update_defense_level is called from keventd and from sysctl,
 *	so it needs to protect itself from softirqs
 *
 *	Re-evaluates the three defense strategies (drop_entry, drop_packet,
 *	secure_tcp) against current memory pressure.  For each strategy the
 *	sysctl value means: 0=off, 1=auto (currently off), 2=auto (currently
 *	on), 3=always on; modes 1 and 2 toggle between each other based on
 *	the "nomem" condition.
 */
static void update_defense_level(struct netns_ipvs *ipvs)
{
	struct sysinfo i;
	int availmem;
	int amemthresh;
	int nomem;
	int to_change = -1;	/* secure_tcp: -1=no change, 0=off, 1=on */

	/* we only count free and buffered memory (in pages) */
	si_meminfo(&i);
	availmem = i.freeram + i.bufferram;
	/* however in linux 2.5 the i.bufferram is total page cache size,
	   we need adjust it */
	/* si_swapinfo(&i); */
	/* availmem = availmem - (i.totalswap - i.freeswap); */

	/* Clamp the sysctl to >= 0 so the divisions below stay safe */
	amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
	/* nomem implies availmem < amemthresh, so amemthresh - availmem >= 1
	 * in the drop_packet cases below (no division by zero)
	 */
	nomem = (availmem < amemthresh);

	local_bh_disable();

	/* drop_entry */
	spin_lock(&ipvs->dropentry_lock);
	switch (ipvs->sysctl_drop_entry) {
	case 0:
		atomic_set(&ipvs->dropentry, 0);
		break;
	case 1:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
			ipvs->sysctl_drop_entry = 2;
		} else {
			atomic_set(&ipvs->dropentry, 0);
		}
		break;
	case 2:
		if (nomem) {
			atomic_set(&ipvs->dropentry, 1);
		} else {
			atomic_set(&ipvs->dropentry, 0);
			ipvs->sysctl_drop_entry = 1;
		}
		break;
	case 3:
		atomic_set(&ipvs->dropentry, 1);
		break;
	}
	spin_unlock(&ipvs->dropentry_lock);

	/* drop_packet */
	spin_lock(&ipvs->droppacket_lock);
	switch (ipvs->sysctl_drop_packet) {
	case 0:
		ipvs->drop_rate = 0;
		break;
	case 1:
		if (nomem) {
			/* Drop rate rises as available memory falls */
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate = ipvs->drop_counter;
			ipvs->sysctl_drop_packet = 2;
		} else {
			ipvs->drop_rate = 0;
		}
		break;
	case 2:
		if (nomem) {
			ipvs->drop_counter = amemthresh / (amemthresh - availmem);
			ipvs->drop_rate = ipvs->drop_counter;
		} else {
			ipvs->drop_rate = 0;
			ipvs->sysctl_drop_packet = 1;
		}
		break;
	case 3:
		ipvs->drop_rate = ipvs->sysctl_am_droprate;
		break;
	}
	spin_unlock(&ipvs->droppacket_lock);

	/* secure_tcp */
	spin_lock(&ipvs->securetcp_lock);
	switch (ipvs->sysctl_secure_tcp) {
	case 0:
		if (ipvs->old_secure_tcp >= 2)
			to_change = 0;
		break;
	case 1:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
			ipvs->sysctl_secure_tcp = 2;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
		}
		break;
	case 2:
		if (nomem) {
			if (ipvs->old_secure_tcp < 2)
				to_change = 1;
		} else {
			if (ipvs->old_secure_tcp >= 2)
				to_change = 0;
			ipvs->sysctl_secure_tcp = 1;
		}
		break;
	case 3:
		if (ipvs->old_secure_tcp < 2)
			to_change = 1;
		break;
	}
	ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
	/* Only switch protocol timeout tables when on/off state changed */
	if (to_change >= 0)
		ip_vs_protocol_timeout_change(ipvs,
					      ipvs->sysctl_secure_tcp > 1);
	spin_unlock(&ipvs->securetcp_lock);

	local_bh_enable();
}
212 
/* Handler for delayed work for expiring no
 * destination connections
 */
static void expire_nodest_conn_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs;

	/* Recover the per-netns context from the embedded delayed work */
	ipvs = container_of(work, struct netns_ipvs,
			    expire_nodest_conn_work.work);
	ip_vs_expire_nodest_conn_flush(ipvs);
}
224 
225 /*
226  *	Timer for checking the defense
227  */
228 #define DEFENSE_TIMER_PERIOD	1*HZ
229 
/* Periodic defense work: re-evaluate the defense level and randomly drop
 * connection entries while the drop_entry strategy is active.  Re-arms
 * itself every DEFENSE_TIMER_PERIOD.
 */
static void defense_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);

	update_defense_level(ipvs);
	if (atomic_read(&ipvs->dropentry))
		ip_vs_random_dropentry(ipvs);
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);
}
241 #endif
242 
/* Reload the estimator kthreads after a config change (est_genid bumped)
 * and (re)start any kthread that failed to start; reschedules itself
 * while starts keep failing.
 */
static void est_reload_work_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, est_reload_work.work);
	int genid_done = atomic_read(&ipvs->est_genid_done);
	unsigned long delay = HZ / 10;	/* repeat startups after failure */
	bool repeat = false;
	int genid;
	int id;

	mutex_lock(&ipvs->est_mutex);
	genid = atomic_read(&ipvs->est_genid);
	for (id = 0; id < ipvs->est_kt_count; id++) {
		struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];

		/* netns clean up started, abort delayed work */
		if (!READ_ONCE(ipvs->enable))
			goto unlock;
		if (!kd)
			continue;
		/* New config ? Stop kthread tasks */
		if (genid != genid_done)
			ip_vs_est_kthread_stop(kd);
		if (!kd->task && !ip_vs_est_stopped(ipvs)) {
			/* Do not start kthreads above 0 in calc phase */
			if ((!id || !ipvs->est_calc_phase) &&
			    ip_vs_est_kthread_start(ipvs, kd) < 0)
				repeat = true;
		}
	}

	/* All kthreads were restarted for the current genid */
	atomic_set(&ipvs->est_genid_done, genid);

	if (repeat)
		queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
				   delay);

unlock:
	mutex_unlock(&ipvs->est_mutex);
}
283 
284 static int get_conn_tab_size(struct netns_ipvs *ipvs)
285 {
286 	const struct ip_vs_rht *t;
287 	int size = 0;
288 
289 	rcu_read_lock();
290 	t = rcu_dereference(ipvs->conn_tab);
291 	if (t)
292 		size = t->size;
293 	rcu_read_unlock();
294 
295 	return size;
296 }
297 
/* Take a reference on the ip_vs module.
 * Returns non-zero on success, 0 if the module is being unloaded.
 */
int
ip_vs_use_count_inc(void)
{
	return try_module_get(THIS_MODULE);
}
303 
/* Drop a module reference taken with ip_vs_use_count_inc(). */
void
ip_vs_use_count_dec(void)
{
	module_put(THIS_MODULE);
}
309 
310 
311 /* Service hashing:
312  * Operation			Locking order
313  * ---------------------------------------------------------------------------
314  * add table			service_mutex, svc_resize_sem(W)
315  * del table			service_mutex
316  * move between tables		svc_resize_sem(W), seqcount_t(W), bit lock
317  * add/del service		service_mutex, bit lock
318  * find service			RCU, seqcount_t(R)
319  * walk services(blocking)	service_mutex, svc_resize_sem(R)
320  * walk services(non-blocking)	RCU, seqcount_t(R)
321  *
322  * - new tables are linked/unlinked under service_mutex and svc_resize_sem
323  * - new table is linked on resizing and all operations can run in parallel
324  * in 2 tables until the new table is registered as current one
325  * - two contexts can modify buckets: config and table resize, both in
326  * process context
327  * - only table resizer can move entries, so we do not protect t->seqc[]
328  * items with t->lock[]
329  * - lookups occur under RCU lock and seqcount reader lock to detect if
330  * services are moved to new table
331  * - move operations may disturb readers: find operation will not miss entries
332  * but walkers may see same entry twice if they are forced to retry chains
333  * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
334  * service_mutex to disallow new tables to be installed or to check
335  * svc_table_changes and repeat the RCU read section if new table is installed
336  */
337 
338 /*
339  *	Returns hash value for virtual service
340  */
static inline u32
ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
		  const union nf_inet_addr *addr, __be16 port)
{
	/* port is hashed in host byte order */
	return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
}
347 
348 /*
349  *	Returns hash value of fwmark for virtual service lookup
350  */
static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
					__u32 fwmark)
{
	/* Seed with the table's hash key so hashes differ per table */
	return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
}
356 
357 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
/* Hashes a service in the svc_table by <proto,addr,port> or by fwmark.
 * Returns 0 if the service was already hashed, 1 on success.
 * Caller must hold service_mutex (new entries go into the most recent
 * table, obtained via new_tbl).
 */
static int ip_vs_svc_hash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash;

	if (svc->flags & IP_VS_SVC_F_HASHED) {
		pr_err("%s(): request for already hashed, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	/* increase its refcnt because it is referenced by the svc table */
	atomic_inc(&svc->refcnt);

	/* New entries go into recent table */
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	t = rcu_dereference_protected(t->new_tbl, 1);

	if (svc->fwmark == 0) {
		/*
		 *  Hash it by <protocol,addr,port>
		 */
		hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
					 &svc->addr, svc->port);
	} else {
		/*
		 *  Hash it by fwmark
		 */
		hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
	}
	head = t->buckets + (hash & t->mask);
	hlist_bl_lock(head);
	/* hash_key encodes both the bucket hash and the table id, so
	 * lookups/unhash can tell which table the entry lives in
	 */
	WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
	svc->flags |= IP_VS_SVC_F_HASHED;
	hlist_bl_add_head_rcu(&svc->s_list, head);
	hlist_bl_unlock(head);

	return 1;
}
399 
400 
401 /*
402  *	Unhashes a service from svc_table.
403  *	Should be called with locked tables.
404  */
static int ip_vs_svc_unhash(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t;
	u32 hash_key2;
	u32 hash_key;

	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
		pr_err("%s(): request for unhash flagged, called from %pS\n",
		       __func__, __builtin_return_address(0));
		return 0;
	}

	t = rcu_dereference_protected(ipvs->svc_table, 1);
	hash_key = READ_ONCE(svc->hash_key);
	/* We need to lock the bucket in the right table */
	if (ip_vs_rht_same_table(t, hash_key)) {
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
		/* Ensure hash_key is read under lock */
		hash_key2 = READ_ONCE(svc->hash_key);
		/* Moved to new table ?  The resizer may have migrated the
		 * entry between our first read and taking the bucket lock.
		 */
		if (hash_key != hash_key2) {
			hlist_bl_unlock(head);
			t = rcu_dereference_protected(t->new_tbl, 1);
			head = t->buckets + (hash_key2 & t->mask);
			hlist_bl_lock(head);
		}
	} else {
		/* It is already moved to new table */
		t = rcu_dereference_protected(t->new_tbl, 1);
		head = t->buckets + (hash_key & t->mask);
		hlist_bl_lock(head);
	}
	/* Remove it from svc_table */
	hlist_bl_del_rcu(&svc->s_list);

	svc->flags &= ~IP_VS_SVC_F_HASHED;
	/* Drop the reference held by the svc table */
	atomic_dec(&svc->refcnt);
	hlist_bl_unlock(head);
	return 1;
}
448 
449 
450 /*
451  *	Get service by {netns, proto,addr,port} in the service table.
452  */
static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
		     const union nf_inet_addr *vaddr, __be16 vport)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	/* During a resize two tables can be live; walk both so entries
	 * already moved to the new table are not missed
	 */
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for "full" addressed entries */
		hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				/* hash_key check filters entries that were
				 * re-hashed into the other table
				 */
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->af == af &&
				    ip_vs_addr_equal(af, &svc->addr, vaddr) &&
				    svc->port == vport &&
				    svc->protocol == protocol && !svc->fwmark) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}
485 
486 
487 /*
488  *	Get service by {fwmark} in the service table.
489  */
static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;
	u32 hash, hash_key;

	/* Walk both live tables (see __ip_vs_service_find) */
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
		/* Check for fwmark addressed entries */
		hash = ip_vs_svc_fwm_hashval(t, af, fwmark);

		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
				if (READ_ONCE(svc->hash_key) == hash_key &&
				    svc->fwmark == fwmark && svc->af == af) {
					/* HIT */
					return svc;
				}
			}
		}
	}

	return NULL;
}
518 
519 /* Find service, called under RCU lock */
struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
		   const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_service *svc = NULL;
	int af_id = ip_vs_af_index(af);

	/*
	 *	Check the table hashed by fwmark first
	 */
	if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
		svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
		if (svc)
			goto out;
	}

	/* Counters let us skip lookups when no such services exist */
	if (!atomic_read(&ipvs->nonfwm_services[af_id]))
		goto out;

	/*
	 *	Check the table hashed by <protocol,addr,port>
	 *	for "full" addressed entries
	 */
	svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
	if (svc)
		goto out;

	if (protocol == IPPROTO_TCP &&
	    atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
	    (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
		/*
		 * Check if ftp service entry exists, the packet
		 * might belong to FTP data connections.
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
		if (svc)
			goto out;
	}

	if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
		/*
		 * Check if the catch-all port (port zero) exists
		 */
		svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
	}

  out:
	IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
		      fwmark, ip_vs_proto_name(protocol),
		      IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
		      svc ? "hit" : "not hit");

	return svc;
}
574 
575 /* Return the number of registered services */
576 static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
577 {
578 	int ns = 0, ni = IP_VS_AF_MAX;
579 
580 	while (--ni >= 0)
581 		ns += atomic_read(&ipvs->num_services[ni]);
582 	return ns;
583 }
584 
585 /* Get default load factor to map num_services/u_thresh to t->size */
586 static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
587 {
588 	int factor;
589 
590 	if (net_eq(ipvs->net, &init_net))
591 		factor = -3;	/* grow if load is above 12.5% */
592 	else
593 		factor = -2;	/* grow if load is above 25% */
594 	return factor;
595 }
596 
/* Get the desired svc_table size for the current number of services and
 * load factor @lfactor, clamped to the configured min/max table bits.
 */
static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
				  int lfactor)
{
	return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
				      lfactor, IP_VS_SVC_TAB_MIN_BITS,
				      IP_VS_SVC_TAB_MAX_BITS);
}
605 
/* Allocate svc_table with @buckets buckets and load factor @lfactor.
 * Returns NULL on allocation failure.
 */
static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
					       int buckets, int lfactor)
{
	struct ip_vs_rht *t;
	int scounts, locks;

	/* No frequent lookups to race with resizing, so use max of 64
	 * seqcounts. Only resizer moves entries, so use 0 locks.
	 */
	scounts = clamp(buckets >> 4, 1, 64);
	locks = 0;

	t = ip_vs_rht_alloc(buckets, scounts, locks);
	if (!t)
		return NULL;
	t->lfactor = lfactor;
	/* Set grow/shrink thresholds used to decide future resizes */
	ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
				 IP_VS_SVC_TAB_MAX_BITS);
	return t;
}
627 
/* svc_table resizer work.
 * Phases: (1) under service_mutex decide whether a resize is needed and
 * allocate + link the new table; (2) with the mutex dropped, migrate
 * entries bucket by bucket under seqcount writer + bucket locks;
 * (3) retake the mutex, install the new table and free the old one after
 * an RCU grace period.  Aborts early on netns cleanup (enable cleared)
 * or when NORESIZE is set.
 */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table needs to be registered? BUG! */
	if (t != rcu_dereference_protected(t->new_tbl, 1))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Retry later via the requeue at "out" */
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		/* Check for abort/resched every 16 processed units */
		if (++limit >= 16) {
			if (!READ_ONCE(ipvs->enable) ||
			    test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/*  Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		if (limit >= 64)
			goto same_bucket;
	}

	/* Tables can be switched only under service_mutex */
	while (!mutex_trylock(&ipvs->service_mutex)) {
		cond_resched();
		if (!READ_ONCE(ipvs->enable) ||
		    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
			goto unlock_sem;
	}
	if (!READ_ONCE(ipvs->enable) ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_m;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	t_free = t;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent new table to be attached wait here instead of
		 * freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
}
789 
/* Bind @dest to @svc, taking a reference to the service. */
static inline void
__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	atomic_inc(&svc->refcnt);
	rcu_assign_pointer(dest->svc, svc);
}
796 
/* Free a service and release its embedded stats. */
static void ip_vs_service_free(struct ip_vs_service *svc)
{
	ip_vs_stats_release(&svc->stats);
	kfree(svc);
}
802 
/* RCU callback: free a service after the grace period expires. */
static void ip_vs_service_rcu_free(struct rcu_head *head)
{
	struct ip_vs_service *svc;

	svc = container_of(head, struct ip_vs_service, rcu_head);
	ip_vs_service_free(svc);
}
810 
/* Drop a reference to @svc; free it via RCU when the last ref is gone. */
static void __ip_vs_svc_put(struct ip_vs_service *svc)
{
	if (atomic_dec_and_test(&svc->refcnt)) {
		IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
			      svc->fwmark,
			      IP_VS_DBG_ADDR(svc->af, &svc->addr),
			      ntohs(svc->port));
		call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
	}
}
821 
822 
823 /*
824  *	Returns hash value for real service
825  */
static inline unsigned int ip_vs_rs_hashkey(int af,
					    const union nf_inet_addr *addr,
					    __be16 port)
{
	unsigned int porth = ntohs(port);
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	/* Fold the 128-bit IPv6 address into 32 bits by XOR */
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif

	/* Mix address and port, then mask to the rs_table size */
	return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
		& IP_VS_RTAB_MASK;
}
842 
843 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
	unsigned int hash;
	__be16 port;

	/* Already hashed, nothing to do */
	if (dest->in_rs_table)
		return;

	/* Pick the port that identifies the real server for this
	 * forwarding method; dests with other methods are not hashed
	 */
	switch (IP_VS_DFWD_METHOD(dest)) {
	case IP_VS_CONN_F_MASQ:
		port = dest->port;
		break;
	case IP_VS_CONN_F_TUNNEL:
		switch (dest->tun_type) {
		case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
			port = dest->tun_port;
			break;
		case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
		case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
			port = 0;
			break;
		default:
			return;
		}
		break;
	default:
		return;
	}

	/*
	 *	Hash by proto,addr,port,
	 *	which are the parameters of the real service.
	 */
	hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);

	hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
	dest->in_rs_table = 1;
}
882 
883 /* Unhash ip_vs_dest from rs_table. */
884 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
885 {
886 	/*
887 	 * Remove it from the rs_table table.
888 	 */
889 	if (dest->in_rs_table) {
890 		hlist_del_rcu(&dest->d_list);
891 		dest->in_rs_table = 0;
892 	}
893 }
894 
895 /* Check if real service by <proto,addr,port> is present */
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
			    const union nf_inet_addr *daddr, __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		/* fwmark services match any protocol; only MASQ dests
		 * are considered real services here
		 */
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return true;
		}
	}

	return false;
}
918 
919 /* Find real service record by <proto,addr,port>.
920  * In case of multiple records with the same <proto,addr,port>, only
921  * the first found record is returned.
922  *
923  * To be called under RCU lock.
924  */
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
					   __u16 protocol,
					   const union nf_inet_addr *daddr,
					   __be16 dport)
{
	unsigned int hash;
	struct ip_vs_dest *dest;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		/* Same matching rules as ip_vs_has_real_service() */
		if (dest->port == dport &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    (dest->protocol == protocol || dest->vfwmark) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}
949 
950 /* Find real service record by <af,addr,tun_port>.
951  * In case of multiple records with the same <af,addr,tun_port>, only
952  * the first found record is returned.
953  *
954  * To be called under RCU lock.
955  */
struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
				     const union nf_inet_addr *daddr,
				     __be16 tun_port)
{
	struct ip_vs_dest *dest;
	unsigned int hash;

	/* Check for "full" addressed entries */
	hash = ip_vs_rs_hashkey(af, daddr, tun_port);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
		/* Only tunnel dests match; they are hashed by tun_port */
		if (dest->tun_port == tun_port &&
		    dest->af == af &&
		    ip_vs_addr_equal(af, &dest->addr, daddr) &&
		    IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
			/* HIT */
			return dest;
		}
	}

	return NULL;
}
978 
979 /* Lookup destination by {addr,port} in the given service
980  * Called under RCU lock.
981  */
982 static struct ip_vs_dest *
983 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
984 		  const union nf_inet_addr *daddr, __be16 dport)
985 {
986 	struct ip_vs_dest *dest;
987 
988 	/*
989 	 * Find the destination for the given service
990 	 */
991 	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
992 		if ((dest->af == dest_af) &&
993 		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
994 		    (dest->port == dport)) {
995 			/* HIT */
996 			return dest;
997 		}
998 	}
999 
1000 	return NULL;
1001 }
1002 
1003 /*
1004  * Find destination by {daddr,dport,vaddr,protocol}
1005  * Created to be used in ip_vs_process_message() in
1006  * the backup synchronization daemon. It finds the
1007  * destination to be bound to the received connection
1008  * on the backup.
1009  * Called under RCU lock, no refcnt is returned.
1010  */
struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
				   const union nf_inet_addr *daddr,
				   __be16 dport,
				   const union nf_inet_addr *vaddr,
				   __be16 vport, __u16 protocol, __u32 fwmark,
				   __u32 flags)
{
	struct ip_vs_dest *dest;
	struct ip_vs_service *svc;
	__be16 port = dport;

	svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
	if (!svc)
		return NULL;
	/* For non-MASQ fwmark services try the wildcard port first */
	if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
		port = 0;
	dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
	if (!dest)
		/* port ^ dport flips between dport and 0: retry with the
		 * alternative port not tried above
		 */
		dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
	return dest;
}
1032 
/* RCU callback: release the cached route and free the dest_dst. */
void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}
1042 
/* Release dest_dst and dst_cache for dest in user context.
 * The old entry is freed after an RCU grace period.
 */
static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst, 1);
	if (old) {
		RCU_INIT_POINTER(dest->dest_dst, NULL);
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
	}
}
1054 
1055 /*
1056  *  Lookup dest by {svc,addr,port} in the destination trash.
1057  *  The destination trash is used to hold the destinations that are removed
1058  *  from the service table but are still referenced by some conn entries.
1059  *  The reason to add the destination trash is when the dest is temporary
1060  *  down (either by administrator or by monitor program), the dest can be
1061  *  picked back from the trash, the remaining connections to the dest can
1062  *  continue, and the counting information of the dest is also useful for
1063  *  scheduling.
1064  */
static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
		     const union nf_inet_addr *daddr, __be16 dport)
{
	struct ip_vs_dest *dest;
	struct netns_ipvs *ipvs = svc->ipvs;

	/*
	 * Find the destination in trash
	 */
	spin_lock_bh(&ipvs->dest_trash_lock);
	list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
		IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
			      "dest->refcnt=%d\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      refcount_read(&dest->refcnt));
		/* Must match both the real server identity and the
		 * virtual service (fwmark or vaddr:vport) it belonged to
		 */
		if (dest->af == dest_af &&
		    ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
		    dest->port == dport &&
		    dest->vfwmark == svc->fwmark &&
		    dest->protocol == svc->protocol &&
		    (svc->fwmark ||
		     (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
		      dest->vport == svc->port))) {
			/* HIT: take it out of the trash for reuse */
			list_del(&dest->t_list);
			goto out;
		}
	}

	dest = NULL;

out:
	spin_unlock_bh(&ipvs->dest_trash_lock);

	return dest;
}
1104 
1105 static void ip_vs_dest_rcu_free(struct rcu_head *head)
1106 {
1107 	struct ip_vs_dest *dest;
1108 
1109 	dest = container_of(head, struct ip_vs_dest, rcu_head);
1110 	ip_vs_stats_release(&dest->stats);
1111 	ip_vs_dest_put_and_free(dest);
1112 }
1113 
1114 static void ip_vs_dest_free(struct ip_vs_dest *dest)
1115 {
1116 	struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
1117 
1118 	__ip_vs_svc_put(svc);
1119 	call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
1120 }
1121 
1122 /*
1123  *  Clean up all the destinations in the trash
1124  *  Called by the ip_vs_control_cleanup()
1125  *
1126  *  When the ip_vs_control_clearup is activated by ipvs module exit,
1127  *  the service tables must have been flushed and all the connections
1128  *  are expired, and the refcnt of each destination in the trash must
1129  *  be 1, so we simply release them here.
1130  */
1131 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
1132 {
1133 	struct ip_vs_dest *dest, *nxt;
1134 
1135 	timer_delete_sync(&ipvs->dest_trash_timer);
1136 	/* No need to use dest_trash_lock */
1137 	list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
1138 		list_del(&dest->t_list);
1139 		ip_vs_dest_free(dest);
1140 	}
1141 }
1142 
1143 static void ip_vs_stats_rcu_free(struct rcu_head *head)
1144 {
1145 	struct ip_vs_stats_rcu *rs = container_of(head,
1146 						  struct ip_vs_stats_rcu,
1147 						  rcu_head);
1148 
1149 	ip_vs_stats_release(&rs->s);
1150 	kfree(rs);
1151 }
1152 
/* Snapshot @src counters into @dst under src->lock.  Each counter is
 * reported relative to the zero point saved in kstats0 by
 * ip_vs_zero_stats(), so a user-requested "zero" survives without
 * actually clearing the kernel counters.
 */
static void
ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
{
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c

	spin_lock(&src->lock);

	IP_VS_SHOW_STATS_COUNTER(conns);
	IP_VS_SHOW_STATS_COUNTER(inpkts);
	IP_VS_SHOW_STATS_COUNTER(outpkts);
	IP_VS_SHOW_STATS_COUNTER(inbytes);
	IP_VS_SHOW_STATS_COUNTER(outbytes);

	/* Fill in the rate estimates from the estimator */
	ip_vs_read_estimator(dst, src);

	spin_unlock(&src->lock);
}
1170 
/* Convert a 64-bit kernel stats snapshot to the legacy user-space
 * layout.  All fields except the byte counters are truncated to
 * 32 bits (the old ABI uses u32 for them).
 */
static void
ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
{
	dst->conns = (u32)src->conns;
	dst->inpkts = (u32)src->inpkts;
	dst->outpkts = (u32)src->outpkts;
	dst->inbytes = src->inbytes;
	dst->outbytes = src->outbytes;
	dst->cps = (u32)src->cps;
	dst->inpps = (u32)src->inpps;
	dst->outpps = (u32)src->outpps;
	dst->inbps = (u32)src->inbps;
	dst->outbps = (u32)src->outbps;
}
1185 
/* "Zero" a stats object without destroying the running counters:
 * the current values are recorded as the new zero point (kstats0),
 * which ip_vs_copy_stats() subtracts on read; the rate estimator is
 * zeroed for real.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
	spin_lock(&stats->lock);

	/* get current counters as zero point, rates are zeroed */

#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c

	IP_VS_ZERO_STATS_COUNTER(conns);
	IP_VS_ZERO_STATS_COUNTER(inpkts);
	IP_VS_ZERO_STATS_COUNTER(outpkts);
	IP_VS_ZERO_STATS_COUNTER(inbytes);
	IP_VS_ZERO_STATS_COUNTER(outbytes);

	ip_vs_zero_estimator(stats);

	spin_unlock(&stats->lock);
}
1205 
1206 /* Allocate fields after kzalloc */
1207 int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
1208 {
1209 	int i;
1210 
1211 	spin_lock_init(&s->lock);
1212 	s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1213 	if (!s->cpustats)
1214 		return -ENOMEM;
1215 
1216 	for_each_possible_cpu(i) {
1217 		struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
1218 
1219 		u64_stats_init(&cs->syncp);
1220 	}
1221 	return 0;
1222 }
1223 
1224 struct ip_vs_stats *ip_vs_stats_alloc(void)
1225 {
1226 	struct ip_vs_stats *s = kzalloc_obj(*s);
1227 
1228 	if (s && ip_vs_stats_init_alloc(s) >= 0)
1229 		return s;
1230 	kfree(s);
1231 	return NULL;
1232 }
1233 
/* Free the per-cpu counters of a stats object; the object itself is
 * owned (and freed) by the caller.
 */
void ip_vs_stats_release(struct ip_vs_stats *stats)
{
	free_percpu(stats->cpustats);
}
1238 
/* Release and free a stats object allocated by ip_vs_stats_alloc();
 * accepts NULL.
 */
void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (!stats)
		return;
	ip_vs_stats_release(stats);
	kfree(stats);
}
1246 
1247 /*
1248  *	Update a destination in the given service
1249  */
1250 static void
1251 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
1252 		    struct ip_vs_dest_user_kern *udest, int add)
1253 {
1254 	struct netns_ipvs *ipvs = svc->ipvs;
1255 	struct ip_vs_service *old_svc;
1256 	struct ip_vs_scheduler *sched;
1257 	int conn_flags;
1258 
1259 	/* We cannot modify an address and change the address family */
1260 	BUG_ON(!add && udest->af != dest->af);
1261 
1262 	if (add && udest->af != svc->af)
1263 		ipvs->mixed_address_family_dests++;
1264 
1265 	/* keep the last_weight with latest non-0 weight */
1266 	if (add || udest->weight != 0)
1267 		atomic_set(&dest->last_weight, udest->weight);
1268 
1269 	/* set the weight and the flags */
1270 	atomic_set(&dest->weight, udest->weight);
1271 	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
1272 	conn_flags |= IP_VS_CONN_F_INACTIVE;
1273 
1274 	/* Need to rehash? */
1275 	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
1276 	    IP_VS_DFWD_METHOD(dest) ||
1277 	    udest->tun_type != dest->tun_type ||
1278 	    udest->tun_port != dest->tun_port)
1279 		ip_vs_rs_unhash(dest);
1280 
1281 	/* set the tunnel info */
1282 	dest->tun_type = udest->tun_type;
1283 	dest->tun_port = udest->tun_port;
1284 	dest->tun_flags = udest->tun_flags;
1285 
1286 	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
1287 	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
1288 		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
1289 	} else {
1290 		/* FTP-NAT requires conntrack for mangling */
1291 		if (svc->port == FTPPORT)
1292 			ip_vs_register_conntrack(svc);
1293 	}
1294 	atomic_set(&dest->conn_flags, conn_flags);
1295 	/* Put the real service in rs_table if not present. */
1296 	ip_vs_rs_hash(ipvs, dest);
1297 
1298 	/* bind the service */
1299 	old_svc = rcu_dereference_protected(dest->svc, 1);
1300 	if (!old_svc) {
1301 		__ip_vs_bind_svc(dest, svc);
1302 	} else {
1303 		if (old_svc != svc) {
1304 			ip_vs_zero_stats(&dest->stats);
1305 			__ip_vs_bind_svc(dest, svc);
1306 			__ip_vs_svc_put(old_svc);
1307 		}
1308 	}
1309 
1310 	/* set the dest status flags */
1311 	dest->flags |= IP_VS_DEST_F_AVAILABLE;
1312 
1313 	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
1314 		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
1315 	dest->u_threshold = udest->u_threshold;
1316 	dest->l_threshold = udest->l_threshold;
1317 
1318 	dest->af = udest->af;
1319 
1320 	if (add) {
1321 		list_add_rcu(&dest->n_list, &svc->destinations);
1322 		svc->num_dests++;
1323 		sched = rcu_dereference_protected(svc->scheduler, 1);
1324 		if (sched && sched->add_dest)
1325 			sched->add_dest(svc, dest);
1326 	} else {
1327 		spin_lock_bh(&dest->dst_lock);
1328 		__ip_vs_dst_cache_reset(dest);
1329 		spin_unlock_bh(&dest->dst_lock);
1330 
1331 		sched = rcu_dereference_protected(svc->scheduler, 1);
1332 		if (sched && sched->upd_dest)
1333 			sched->upd_dest(svc, dest);
1334 	}
1335 }
1336 
1337 
1338 /*
1339  *	Create a destination for the given service
1340  */
1341 static int
1342 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1343 {
1344 	struct ip_vs_dest *dest;
1345 	unsigned int atype;
1346 	int ret;
1347 
1348 #ifdef CONFIG_IP_VS_IPV6
1349 	if (udest->af == AF_INET6) {
1350 		atype = ipv6_addr_type(&udest->addr.in6);
1351 		if ((!(atype & IPV6_ADDR_UNICAST) ||
1352 			atype & IPV6_ADDR_LINKLOCAL) &&
1353 			!__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
1354 			return -EINVAL;
1355 
1356 		ret = nf_defrag_ipv6_enable(svc->ipvs->net);
1357 		if (ret)
1358 			return ret;
1359 	} else
1360 #endif
1361 	{
1362 		atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
1363 		if (atype != RTN_LOCAL && atype != RTN_UNICAST)
1364 			return -EINVAL;
1365 	}
1366 
1367 	dest = kzalloc_obj(struct ip_vs_dest);
1368 	if (dest == NULL)
1369 		return -ENOMEM;
1370 
1371 	ret = ip_vs_stats_init_alloc(&dest->stats);
1372 	if (ret < 0)
1373 		goto err_alloc;
1374 
1375 	ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1376 	if (ret < 0)
1377 		goto err_stats;
1378 
1379 	dest->af = udest->af;
1380 	dest->protocol = svc->protocol;
1381 	dest->vaddr = svc->addr;
1382 	dest->vport = svc->port;
1383 	dest->vfwmark = svc->fwmark;
1384 	ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
1385 	dest->port = udest->port;
1386 
1387 	atomic_set(&dest->activeconns, 0);
1388 	atomic_set(&dest->inactconns, 0);
1389 	atomic_set(&dest->persistconns, 0);
1390 	refcount_set(&dest->refcnt, 1);
1391 
1392 	INIT_HLIST_NODE(&dest->d_list);
1393 	spin_lock_init(&dest->dst_lock);
1394 	__ip_vs_update_dest(svc, dest, udest, 1);
1395 
1396 	return 0;
1397 
1398 err_stats:
1399 	ip_vs_stats_release(&dest->stats);
1400 
1401 err_alloc:
1402 	kfree(dest);
1403 	return ret;
1404 }
1405 
1406 
1407 /*
1408  *	Add a destination into an existing service
1409  */
1410 static int
1411 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1412 {
1413 	struct ip_vs_dest *dest;
1414 	union nf_inet_addr daddr;
1415 	__be16 dport = udest->port;
1416 	int ret;
1417 
1418 	if (udest->weight < 0) {
1419 		pr_err("%s(): server weight less than zero\n", __func__);
1420 		return -ERANGE;
1421 	}
1422 
1423 	if (udest->l_threshold > udest->u_threshold) {
1424 		pr_err("%s(): lower threshold is higher than upper threshold\n",
1425 			__func__);
1426 		return -ERANGE;
1427 	}
1428 
1429 	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1430 		if (udest->tun_port == 0) {
1431 			pr_err("%s(): tunnel port is zero\n", __func__);
1432 			return -EINVAL;
1433 		}
1434 	}
1435 
1436 	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1437 
1438 	/* We use function that requires RCU lock */
1439 	rcu_read_lock();
1440 	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1441 	rcu_read_unlock();
1442 
1443 	if (dest != NULL) {
1444 		IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
1445 		return -EEXIST;
1446 	}
1447 
1448 	/*
1449 	 * Check if the dest already exists in the trash and
1450 	 * is from the same service
1451 	 */
1452 	dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
1453 
1454 	if (dest != NULL) {
1455 		IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
1456 			      "dest->refcnt=%d, service %u/%s:%u\n",
1457 			      IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
1458 			      refcount_read(&dest->refcnt),
1459 			      dest->vfwmark,
1460 			      IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
1461 			      ntohs(dest->vport));
1462 
1463 		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1464 		if (ret < 0)
1465 			return ret;
1466 		__ip_vs_update_dest(svc, dest, udest, 1);
1467 	} else {
1468 		/*
1469 		 * Allocate and initialize the dest structure
1470 		 */
1471 		ret = ip_vs_new_dest(svc, udest);
1472 	}
1473 
1474 	return ret;
1475 }
1476 
1477 
1478 /*
1479  *	Edit a destination in the given service
1480  */
1481 static int
1482 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1483 {
1484 	struct ip_vs_dest *dest;
1485 	union nf_inet_addr daddr;
1486 	__be16 dport = udest->port;
1487 
1488 	if (udest->weight < 0) {
1489 		pr_err("%s(): server weight less than zero\n", __func__);
1490 		return -ERANGE;
1491 	}
1492 
1493 	if (udest->l_threshold > udest->u_threshold) {
1494 		pr_err("%s(): lower threshold is higher than upper threshold\n",
1495 			__func__);
1496 		return -ERANGE;
1497 	}
1498 
1499 	if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1500 		if (udest->tun_port == 0) {
1501 			pr_err("%s(): tunnel port is zero\n", __func__);
1502 			return -EINVAL;
1503 		}
1504 	}
1505 
1506 	ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1507 
1508 	/* We use function that requires RCU lock */
1509 	rcu_read_lock();
1510 	dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1511 	rcu_read_unlock();
1512 
1513 	if (dest == NULL) {
1514 		IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1515 		return -ENOENT;
1516 	}
1517 
1518 	__ip_vs_update_dest(svc, dest, udest, 0);
1519 
1520 	return 0;
1521 }
1522 
1523 /*
1524  *	Delete a destination (must be already unlinked from the service)
1525  */
1526 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1527 			     bool cleanup)
1528 {
1529 	ip_vs_stop_estimator(ipvs, &dest->stats);
1530 
1531 	/*
1532 	 *  Remove it from the d-linked list with the real services.
1533 	 */
1534 	ip_vs_rs_unhash(dest);
1535 
1536 	spin_lock_bh(&ipvs->dest_trash_lock);
1537 	IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1538 		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1539 		      refcount_read(&dest->refcnt));
1540 	if (list_empty(&ipvs->dest_trash) && !cleanup)
1541 		mod_timer(&ipvs->dest_trash_timer,
1542 			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1543 	/* dest lives in trash with reference */
1544 	list_add(&dest->t_list, &ipvs->dest_trash);
1545 	dest->idle_start = 0;
1546 	spin_unlock_bh(&ipvs->dest_trash_lock);
1547 
1548 	/* Queue up delayed work to expire all no destination connections.
1549 	 * No-op when CONFIG_SYSCTL is disabled.
1550 	 */
1551 	if (!cleanup)
1552 		ip_vs_enqueue_expire_nodest_conns(ipvs);
1553 }
1554 
1555 
1556 /*
1557  *	Unlink a destination from the given service
1558  */
1559 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1560 				struct ip_vs_dest *dest,
1561 				int svcupd)
1562 {
1563 	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1564 
1565 	spin_lock_bh(&dest->dst_lock);
1566 	__ip_vs_dst_cache_reset(dest);
1567 	spin_unlock_bh(&dest->dst_lock);
1568 
1569 	/*
1570 	 *  Remove it from the d-linked destination list.
1571 	 */
1572 	list_del_rcu(&dest->n_list);
1573 	svc->num_dests--;
1574 
1575 	if (dest->af != svc->af)
1576 		svc->ipvs->mixed_address_family_dests--;
1577 
1578 	if (svcupd) {
1579 		struct ip_vs_scheduler *sched;
1580 
1581 		sched = rcu_dereference_protected(svc->scheduler, 1);
1582 		if (sched && sched->del_dest)
1583 			sched->del_dest(svc, dest);
1584 	}
1585 }
1586 
1587 
1588 /*
1589  *	Delete a destination server in the given service
1590  */
1591 static int
1592 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1593 {
1594 	struct ip_vs_dest *dest;
1595 	__be16 dport = udest->port;
1596 
1597 	/* We use function that requires RCU lock */
1598 	rcu_read_lock();
1599 	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1600 	rcu_read_unlock();
1601 
1602 	if (dest == NULL) {
1603 		IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1604 		return -ENOENT;
1605 	}
1606 
1607 	/*
1608 	 *	Unlink dest from the service
1609 	 */
1610 	__ip_vs_unlink_dest(svc, dest, 1);
1611 
1612 	/*
1613 	 *	Delete the destination
1614 	 */
1615 	__ip_vs_del_dest(svc->ipvs, dest, false);
1616 
1617 	return 0;
1618 }
1619 
/* Trash expiry timer: reap idle destinations from the trash.
 * A dest is freed only when (a) nothing else holds a reference
 * (refcnt == 1) and (b) it has stayed idle for a full
 * IP_VS_DEST_TRASH_PERIOD.  Idleness is tracked in two passes: the
 * first pass that sees refcnt == 1 stamps idle_start, a later pass
 * frees the dest once the period has elapsed.
 */
static void ip_vs_dest_trash_expire(struct timer_list *t)
{
	struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
						     dest_trash_timer);
	struct ip_vs_dest *dest, *next;
	unsigned long now = jiffies;

	spin_lock(&ipvs->dest_trash_lock);
	list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
		if (refcount_read(&dest->refcnt) > 1)
			continue;
		if (dest->idle_start) {
			if (time_before(now, dest->idle_start +
					     IP_VS_DEST_TRASH_PERIOD))
				continue;
		} else {
			/* max(1UL, now) keeps idle_start non-zero, which
			 * is the "not yet stamped" sentinel
			 */
			dest->idle_start = max(1UL, now);
			continue;
		}
		IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
			      dest->vfwmark,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port));
		list_del(&dest->t_list);
		ip_vs_dest_free(dest);
	}
	/* Re-arm while the trash is non-empty */
	if (!list_empty(&ipvs->dest_trash))
		mod_timer(&ipvs->dest_trash_timer,
			  jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
	spin_unlock(&ipvs->dest_trash_lock);
}
1651 
1652 /*
1653  *	Add a service into the service hash table
1654  */
1655 static int
1656 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
1657 		  struct ip_vs_service **svc_p)
1658 {
1659 	struct ip_vs_scheduler *sched = NULL;
1660 	struct ip_vs_rht *tc_new = NULL;
1661 	struct ip_vs_rht *t, *t_new = NULL;
1662 	int af_id = ip_vs_af_index(u->af);
1663 	struct ip_vs_service *svc = NULL;
1664 	struct ip_vs_pe *pe = NULL;
1665 	int ret_hooks = -1;
1666 	int ret = 0;
1667 
1668 	/* increase the module use count */
1669 	if (!ip_vs_use_count_inc())
1670 		return -ENOPROTOOPT;
1671 
1672 	/* Lookup the scheduler by 'u->sched_name' */
1673 	if (strcmp(u->sched_name, "none")) {
1674 		sched = ip_vs_scheduler_get(u->sched_name);
1675 		if (!sched) {
1676 			pr_info("Scheduler module ip_vs_%s not found\n",
1677 				u->sched_name);
1678 			ret = -ENOENT;
1679 			goto out_err;
1680 		}
1681 	}
1682 
1683 	if (u->pe_name && *u->pe_name) {
1684 		pe = ip_vs_pe_getbyname(u->pe_name);
1685 		if (pe == NULL) {
1686 			pr_info("persistence engine module ip_vs_pe_%s "
1687 				"not found\n", u->pe_name);
1688 			ret = -ENOENT;
1689 			goto out_err;
1690 		}
1691 	}
1692 
1693 #ifdef CONFIG_IP_VS_IPV6
1694 	if (u->af == AF_INET6) {
1695 		__u32 plen = (__force __u32) u->netmask;
1696 
1697 		if (plen < 1 || plen > 128) {
1698 			ret = -EINVAL;
1699 			goto out_err;
1700 		}
1701 
1702 		ret = nf_defrag_ipv6_enable(ipvs->net);
1703 		if (ret)
1704 			goto out_err;
1705 	}
1706 #endif
1707 
1708 	t = rcu_dereference_protected(ipvs->svc_table, 1);
1709 	if (!t) {
1710 		int lfactor = sysctl_svc_lfactor(ipvs);
1711 		int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);
1712 
1713 		t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
1714 		if (!t_new) {
1715 			ret = -ENOMEM;
1716 			goto out_err;
1717 		}
1718 	}
1719 
1720 	if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
1721 		int lfactor = sysctl_conn_lfactor(ipvs);
1722 		int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);
1723 
1724 		tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
1725 		if (!tc_new) {
1726 			ret = -ENOMEM;
1727 			goto out_err;
1728 		}
1729 	}
1730 
1731 	if (!atomic_read(&ipvs->num_services[af_id])) {
1732 		ret = ip_vs_register_hooks(ipvs, u->af);
1733 		if (ret < 0)
1734 			goto out_err;
1735 		ret_hooks = ret;
1736 	}
1737 
1738 	svc = kzalloc_obj(struct ip_vs_service);
1739 	if (svc == NULL) {
1740 		IP_VS_DBG(1, "%s(): no memory\n", __func__);
1741 		ret = -ENOMEM;
1742 		goto out_err;
1743 	}
1744 	ret = ip_vs_stats_init_alloc(&svc->stats);
1745 	if (ret < 0)
1746 		goto out_err;
1747 
1748 	/* I'm the first user of the service */
1749 	atomic_set(&svc->refcnt, 0);
1750 
1751 	svc->af = u->af;
1752 	svc->protocol = u->protocol;
1753 	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1754 	svc->port = u->port;
1755 	svc->fwmark = u->fwmark;
1756 	svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
1757 	svc->timeout = u->timeout * HZ;
1758 	svc->netmask = u->netmask;
1759 	svc->ipvs = ipvs;
1760 
1761 	INIT_LIST_HEAD(&svc->destinations);
1762 	spin_lock_init(&svc->sched_lock);
1763 
1764 	/* Bind the scheduler */
1765 	if (sched) {
1766 		ret = ip_vs_bind_scheduler(svc, sched);
1767 		if (ret)
1768 			goto out_err;
1769 	}
1770 
1771 	ret = ip_vs_start_estimator(ipvs, &svc->stats);
1772 	if (ret < 0)
1773 		goto out_err;
1774 
1775 	if (t_new) {
1776 		clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
1777 		rcu_assign_pointer(ipvs->svc_table, t_new);
1778 		t_new = NULL;
1779 	}
1780 	if (tc_new) {
1781 		rcu_assign_pointer(ipvs->conn_tab, tc_new);
1782 		tc_new = NULL;
1783 	}
1784 
1785 	/* Update the virtual service counters */
1786 	if (svc->port == FTPPORT)
1787 		atomic_inc(&ipvs->ftpsvc_counter[af_id]);
1788 	else if (!svc->port && !svc->fwmark)
1789 		atomic_inc(&ipvs->nullsvc_counter[af_id]);
1790 	if (pe && pe->conn_out)
1791 		atomic_inc(&ipvs->conn_out_counter[af_id]);
1792 
1793 	/* Bind the ct retriever */
1794 	RCU_INIT_POINTER(svc->pe, pe);
1795 	pe = NULL;
1796 
1797 	if (svc->fwmark)
1798 		atomic_inc(&ipvs->fwm_services[af_id]);
1799 	else
1800 		atomic_inc(&ipvs->nonfwm_services[af_id]);
1801 	atomic_inc(&ipvs->num_services[af_id]);
1802 
1803 	/* Hash the service into the service table */
1804 	ip_vs_svc_hash(svc);
1805 
1806 	/* Schedule resize work */
1807 	if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
1808 	    !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
1809 		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
1810 				   1);
1811 
1812 	*svc_p = svc;
1813 
1814 	if (!READ_ONCE(ipvs->enable)) {
1815 		/* Now there is a service - full throttle */
1816 		WRITE_ONCE(ipvs->enable, 1);
1817 
1818 		/* Start estimation for first time */
1819 		ip_vs_est_reload_start(ipvs);
1820 	}
1821 
1822 	return 0;
1823 
1824 
1825  out_err:
1826 	if (tc_new)
1827 		ip_vs_rht_free(tc_new);
1828 	if (t_new)
1829 		ip_vs_rht_free(t_new);
1830 	if (ret_hooks >= 0)
1831 		ip_vs_unregister_hooks(ipvs, u->af);
1832 	if (svc != NULL) {
1833 		ip_vs_unbind_scheduler(svc, sched);
1834 		ip_vs_service_free(svc);
1835 	}
1836 	ip_vs_scheduler_put(sched);
1837 	ip_vs_pe_put(pe);
1838 
1839 	/* decrease the module use count */
1840 	ip_vs_use_count_dec();
1841 
1842 	return ret;
1843 }
1844 
1845 
1846 /*
1847  *	Edit a service and bind it with a new scheduler
1848  */
1849 static int
1850 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1851 {
1852 	struct ip_vs_scheduler *sched = NULL, *old_sched;
1853 	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1854 	int ret = 0;
1855 	bool new_pe_conn_out, old_pe_conn_out;
1856 	struct netns_ipvs *ipvs = svc->ipvs;
1857 	int af_id = ip_vs_af_index(svc->af);
1858 
1859 	/*
1860 	 * Lookup the scheduler, by 'u->sched_name'
1861 	 */
1862 	if (strcmp(u->sched_name, "none")) {
1863 		sched = ip_vs_scheduler_get(u->sched_name);
1864 		if (!sched) {
1865 			pr_info("Scheduler module ip_vs_%s not found\n",
1866 				u->sched_name);
1867 			return -ENOENT;
1868 		}
1869 	}
1870 	old_sched = sched;
1871 
1872 	if (u->pe_name && *u->pe_name) {
1873 		pe = ip_vs_pe_getbyname(u->pe_name);
1874 		if (pe == NULL) {
1875 			pr_info("persistence engine module ip_vs_pe_%s "
1876 				"not found\n", u->pe_name);
1877 			ret = -ENOENT;
1878 			goto out;
1879 		}
1880 		old_pe = pe;
1881 	}
1882 
1883 #ifdef CONFIG_IP_VS_IPV6
1884 	if (u->af == AF_INET6) {
1885 		__u32 plen = (__force __u32) u->netmask;
1886 
1887 		if (plen < 1 || plen > 128) {
1888 			ret = -EINVAL;
1889 			goto out;
1890 		}
1891 	}
1892 #endif
1893 
1894 	old_sched = rcu_dereference_protected(svc->scheduler, 1);
1895 	if (sched != old_sched) {
1896 		if (old_sched) {
1897 			ip_vs_unbind_scheduler(svc, old_sched);
1898 			RCU_INIT_POINTER(svc->scheduler, NULL);
1899 			/* Wait all svc->sched_data users */
1900 			synchronize_rcu();
1901 		}
1902 		/* Bind the new scheduler */
1903 		if (sched) {
1904 			ret = ip_vs_bind_scheduler(svc, sched);
1905 			if (ret) {
1906 				ip_vs_scheduler_put(sched);
1907 				goto out;
1908 			}
1909 		}
1910 	}
1911 
1912 	/*
1913 	 * Set the flags and timeout value
1914 	 */
1915 	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1916 	svc->timeout = u->timeout * HZ;
1917 	svc->netmask = u->netmask;
1918 
1919 	old_pe = rcu_dereference_protected(svc->pe, 1);
1920 	if (pe != old_pe) {
1921 		rcu_assign_pointer(svc->pe, pe);
1922 		/* check for optional methods in new pe */
1923 		new_pe_conn_out = (pe && pe->conn_out) ? true : false;
1924 		old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
1925 		if (new_pe_conn_out && !old_pe_conn_out)
1926 			atomic_inc(&ipvs->conn_out_counter[af_id]);
1927 		if (old_pe_conn_out && !new_pe_conn_out)
1928 			atomic_dec(&ipvs->conn_out_counter[af_id]);
1929 	}
1930 
1931 out:
1932 	ip_vs_scheduler_put(old_sched);
1933 	ip_vs_pe_put(old_pe);
1934 	return ret;
1935 }
1936 
1937 /*
1938  *	Delete a service from the service list
1939  *	- The service must be unlinked, unlocked and not referenced!
1940  *	- We are called under _bh lock
1941  */
1942 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1943 {
1944 	struct ip_vs_dest *dest, *nxt;
1945 	struct ip_vs_scheduler *old_sched;
1946 	struct ip_vs_pe *old_pe;
1947 	struct netns_ipvs *ipvs = svc->ipvs;
1948 	int af_id = ip_vs_af_index(svc->af);
1949 
1950 	atomic_dec(&ipvs->num_services[af_id]);
1951 	if (!atomic_read(&ipvs->num_services[af_id]))
1952 		ip_vs_unregister_hooks(ipvs, svc->af);
1953 	if (svc->fwmark)
1954 		atomic_dec(&ipvs->fwm_services[af_id]);
1955 	else
1956 		atomic_dec(&ipvs->nonfwm_services[af_id]);
1957 
1958 	ip_vs_stop_estimator(svc->ipvs, &svc->stats);
1959 
1960 	/* Unbind scheduler */
1961 	old_sched = rcu_dereference_protected(svc->scheduler, 1);
1962 	ip_vs_unbind_scheduler(svc, old_sched);
1963 	ip_vs_scheduler_put(old_sched);
1964 
1965 	/* Unbind persistence engine, keep svc->pe */
1966 	old_pe = rcu_dereference_protected(svc->pe, 1);
1967 	if (old_pe && old_pe->conn_out)
1968 		atomic_dec(&ipvs->conn_out_counter[af_id]);
1969 	ip_vs_pe_put(old_pe);
1970 
1971 	/*
1972 	 *    Unlink the whole destination list
1973 	 */
1974 	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1975 		__ip_vs_unlink_dest(svc, dest, 0);
1976 		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
1977 	}
1978 
1979 	/*
1980 	 *    Update the virtual service counters
1981 	 */
1982 	if (svc->port == FTPPORT)
1983 		atomic_dec(&ipvs->ftpsvc_counter[af_id]);
1984 	else if (!svc->port && !svc->fwmark)
1985 		atomic_dec(&ipvs->nullsvc_counter[af_id]);
1986 
1987 	/*
1988 	 *    Free the service if nobody refers to it
1989 	 */
1990 	__ip_vs_svc_put(svc);
1991 
1992 	/* decrease the module use count */
1993 	ip_vs_use_count_dec();
1994 }
1995 
1996 /*
1997  * Unlink a service from list and try to delete it if its refcnt reached 0
1998  */
1999 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
2000 {
2001 	ip_vs_unregister_conntrack(svc);
2002 	/* Hold svc to avoid double release from dest_trash */
2003 	atomic_inc(&svc->refcnt);
2004 	/*
2005 	 * Unhash it from the service table
2006 	 */
2007 	ip_vs_svc_unhash(svc);
2008 
2009 	__ip_vs_del_service(svc, cleanup);
2010 }
2011 
2012 /*
2013  *	Delete a service from the service list
2014  */
2015 static int ip_vs_del_service(struct ip_vs_service *svc)
2016 {
2017 	struct netns_ipvs *ipvs;
2018 	struct ip_vs_rht *t, *p;
2019 	int ns;
2020 
2021 	if (svc == NULL)
2022 		return -EEXIST;
2023 	ipvs = svc->ipvs;
2024 	ip_vs_unlink_service(svc, false);
2025 	t = rcu_dereference_protected(ipvs->svc_table, 1);
2026 
2027 	/* Drop the table if no more services */
2028 	ns = ip_vs_get_num_services(ipvs);
2029 	if (!ns) {
2030 		/* Stop the resizer and drop the tables */
2031 		set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
2032 		cancel_delayed_work_sync(&ipvs->svc_resize_work);
2033 		if (t) {
2034 			rcu_assign_pointer(ipvs->svc_table, NULL);
2035 			while (1) {
2036 				p = rcu_dereference_protected(t->new_tbl, 1);
2037 				call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
2038 				if (p == t)
2039 					break;
2040 				t = p;
2041 			}
2042 		}
2043 	} else if (ns <= t->l_thresh &&
2044 		   !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
2045 				     &ipvs->work_flags)) {
2046 		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
2047 				   1);
2048 	}
2049 	return 0;
2050 }
2051 
2052 
2053 /*
2054  *	Flush all the virtual services
2055  */
2056 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
2057 {
2058 	DECLARE_IP_VS_RHT_WALK_BUCKETS();
2059 	struct hlist_bl_head *head;
2060 	struct ip_vs_service *svc;
2061 	struct hlist_bl_node *ne;
2062 	struct hlist_bl_node *e;
2063 	struct ip_vs_rht *t, *p;
2064 
2065 	/* Stop the resizer and drop the tables */
2066 	if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
2067 		cancel_delayed_work_sync(&ipvs->svc_resize_work);
2068 	/* No resizer, so now we have exclusive write access */
2069 
2070 	if (ip_vs_get_num_services(ipvs)) {
2071 		ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
2072 			hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list)
2073 				ip_vs_unlink_service(svc, cleanup);
2074 		}
2075 	}
2076 
2077 	/* Unregister the hash table and release it after RCU grace period */
2078 	t = rcu_dereference_protected(ipvs->svc_table, 1);
2079 	if (t) {
2080 		rcu_assign_pointer(ipvs->svc_table, NULL);
2081 		while (1) {
2082 			p = rcu_dereference_protected(t->new_tbl, 1);
2083 			call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
2084 			if (p == t)
2085 				break;
2086 			t = p;
2087 		}
2088 	}
2089 	return 0;
2090 }
2091 
2092 /*
2093  *	Delete service by {netns} in the service table.
2094  *	Called by __ip_vs_batch_cleanup()
2095  */
2096 void ip_vs_service_nets_cleanup(struct list_head *net_list)
2097 {
2098 	struct netns_ipvs *ipvs;
2099 	struct net *net;
2100 
2101 	/* Check for "full" addressed entries */
2102 	list_for_each_entry(net, net_list, exit_list) {
2103 		ipvs = net_ipvs(net);
2104 		mutex_lock(&ipvs->service_mutex);
2105 		ip_vs_flush(ipvs, true);
2106 		mutex_unlock(&ipvs->service_mutex);
2107 	}
2108 }
2109 
/* Put all references for device (dst_cache).
 * Drops the dest's cached route if it goes through @dev; dst_lock
 * protects dest->dest_dst (hence rcu_dereference_protected(..., 1)).
 */
static inline void
ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
	struct ip_vs_dest_dst *dest_dst;

	spin_lock_bh(&dest->dst_lock);
	dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
	if (dest_dst && dest_dst->dst_cache->dev == dev) {
		IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
			      dev->name,
			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
			      ntohs(dest->port),
			      refcount_read(&dest->refcnt));
		__ip_vs_dst_cache_reset(dest);
	}
	spin_unlock_bh(&dest->dst_lock);

}
/* Netdev event receiver
 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
 */
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
			   void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct netns_ipvs *ipvs = net_ipvs(net);
	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
	unsigned int resched_score = 0;
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	struct ip_vs_dest *dest;
	int old_gen, new_gen;

	if (event != NETDEV_DOWN || !ipvs)
		return NOTIFY_DONE;
	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);

	/* Remember the table generation so we can detect a resize that
	 * happens while we drop the RCU lock in cond_resched_rcu()
	 */
	old_gen = atomic_read(&ipvs->svc_table_changes);

	rcu_read_lock();

repeat:
	smp_rmb(); /* ipvs->svc_table and svc_table_changes */
	ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			list_for_each_entry_rcu(dest, &svc->destinations,
						n_list) {
				ip_vs_forget_dev(dest, dev);
				resched_score += 10;
			}
			resched_score++;
		}
		resched_score++;
		/* Periodically yield; if a new table was installed in the
		 * meantime, restart the walk from the beginning
		 */
		if (resched_score >= 100) {
			resched_score = 0;
			cond_resched_rcu();
			new_gen = atomic_read(&ipvs->svc_table_changes);
			/* New table installed ? */
			if (old_gen != new_gen) {
				old_gen = new_gen;
				goto repeat;
			}
		}
	}
	rcu_read_unlock();

	return NOTIFY_DONE;
}
2181 
2182 /*
2183  *	Zero counters in a service or all services
2184  */
2185 static int ip_vs_zero_service(struct ip_vs_service *svc)
2186 {
2187 	struct ip_vs_dest *dest;
2188 
2189 	list_for_each_entry(dest, &svc->destinations, n_list) {
2190 		ip_vs_zero_stats(&dest->stats);
2191 	}
2192 	ip_vs_zero_stats(&svc->stats);
2193 	return 0;
2194 }
2195 
/* Zero the counters of every service and the netns totals.
 * NOTE(review): the walk is not restarted on table resize, so a
 * concurrent resize may presumably cause some services to be missed
 * or visited twice; zeroing twice is harmless — confirm intent.
 */
static int ip_vs_zero_all(struct netns_ipvs *ipvs)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
	unsigned int resched_score = 0;
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;

	rcu_read_lock();

	ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			ip_vs_zero_service(svc);
			resched_score += 10;
		}
		resched_score++;
		/* Periodically allow rescheduling on long walks */
		if (resched_score >= 100) {
			resched_score = 0;
			cond_resched_rcu();
		}
	}

	rcu_read_unlock();

	/* Finally clear the netns-wide totals */
	ip_vs_zero_stats(&ipvs->tot_stats->s);
	return 0;
}
2223 
2224 #ifdef CONFIG_SYSCTL
2225 
2226 static int
2227 proc_do_defense_mode(const struct ctl_table *table, int write,
2228 		     void *buffer, size_t *lenp, loff_t *ppos)
2229 {
2230 	struct netns_ipvs *ipvs = table->extra2;
2231 	int *valp = table->data;
2232 	int val = *valp;
2233 	int rc;
2234 
2235 	struct ctl_table tmp = {
2236 		.data = &val,
2237 		.maxlen = sizeof(int),
2238 		.mode = table->mode,
2239 	};
2240 
2241 	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2242 	if (write && (*valp != val)) {
2243 		if (val < 0 || val > 3) {
2244 			rc = -EINVAL;
2245 		} else {
2246 			*valp = val;
2247 			update_defense_level(ipvs);
2248 		}
2249 	}
2250 	return rc;
2251 }
2252 
2253 static int
2254 proc_do_sync_threshold(const struct ctl_table *table, int write,
2255 		       void *buffer, size_t *lenp, loff_t *ppos)
2256 {
2257 	struct netns_ipvs *ipvs = table->extra2;
2258 	int *valp = table->data;
2259 	int val[2];
2260 	int rc;
2261 	struct ctl_table tmp = {
2262 		.data = &val,
2263 		.maxlen = table->maxlen,
2264 		.mode = table->mode,
2265 	};
2266 
2267 	mutex_lock(&ipvs->sync_mutex);
2268 	memcpy(val, valp, sizeof(val));
2269 	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2270 	if (write) {
2271 		if (val[0] < 0 || val[1] < 0 ||
2272 		    (val[0] >= val[1] && val[1]))
2273 			rc = -EINVAL;
2274 		else
2275 			memcpy(valp, val, sizeof(val));
2276 	}
2277 	mutex_unlock(&ipvs->sync_mutex);
2278 	return rc;
2279 }
2280 
2281 static int
2282 proc_do_sync_ports(const struct ctl_table *table, int write,
2283 		   void *buffer, size_t *lenp, loff_t *ppos)
2284 {
2285 	int *valp = table->data;
2286 	int val = *valp;
2287 	int rc;
2288 
2289 	struct ctl_table tmp = {
2290 		.data = &val,
2291 		.maxlen = sizeof(int),
2292 		.mode = table->mode,
2293 	};
2294 
2295 	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2296 	if (write && (*valp != val)) {
2297 		if (val < 1 || !is_power_of_2(val))
2298 			rc = -EINVAL;
2299 		else
2300 			*valp = val;
2301 	}
2302 	return rc;
2303 }
2304 
/* Parse a CPU list written to est_cpulist and install it as the
 * estimator kthread mask.  Returns 0 on success or a negative errno.
 */
static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
				     void *buffer)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *valp = table->data;
	cpumask_var_t newmask;
	int ret;

	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
		return -ENOMEM;

	ret = cpulist_parse(buffer, newmask);
	if (ret)
		goto out;

	mutex_lock(&ipvs->est_mutex);

	/* Allocate the per-netns mask lazily on first write */
	if (!ipvs->est_cpulist_valid) {
		if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
			ret = -ENOMEM;
			goto unlock;
		}
		ipvs->est_cpulist_valid = 1;
	}
	/* Restrict the new mask to the CPUs this task may run on */
	cpumask_and(newmask, newmask, &current->cpus_mask);
	cpumask_copy(*valp, newmask);
	/* est_max_threads may depend on cpulist size */
	ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
	ipvs->est_calc_phase = 1;
	ip_vs_est_reload_start(ipvs);

unlock:
	mutex_unlock(&ipvs->est_mutex);

out:
	free_cpumask_var(newmask);
	return ret;
}
2343 
2344 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
2345 				     void *buffer, size_t size)
2346 {
2347 	struct netns_ipvs *ipvs = table->extra2;
2348 	cpumask_var_t *valp = table->data;
2349 	struct cpumask *mask;
2350 	int ret;
2351 
2352 	mutex_lock(&ipvs->est_mutex);
2353 
2354 	if (ipvs->est_cpulist_valid)
2355 		mask = *valp;
2356 	else
2357 		mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
2358 	ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
2359 
2360 	mutex_unlock(&ipvs->est_mutex);
2361 
2362 	return ret;
2363 }
2364 
/* sysctl handler for est_cpulist: text get/set of the estimator CPU
 * mask, delegating to ipvs_proc_est_cpumask_set/get.
 */
static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* Ignore both read and write(append) if *ppos not 0 */
	if (*ppos || !*lenp) {
		*lenp = 0;
		return 0;
	}
	if (write) {
		/* proc_sys_call_handler() appends terminator */
		ret = ipvs_proc_est_cpumask_set(table, buffer);
		if (ret >= 0)
			*ppos += *lenp;
	} else {
		/* proc_sys_call_handler() allocates 1 byte for terminator */
		ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
		if (ret >= 0) {
			/* Report bytes produced and consume them */
			*lenp = ret;
			*ppos += *lenp;
			ret = 0;
		}
	}
	return ret;
}
2391 
2392 static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
2393 			      void *buffer, size_t *lenp, loff_t *ppos)
2394 {
2395 	struct netns_ipvs *ipvs = table->extra2;
2396 	int *valp = table->data;
2397 	int val = *valp;
2398 	int ret;
2399 
2400 	struct ctl_table tmp_table = {
2401 		.data = &val,
2402 		.maxlen = sizeof(int),
2403 		.mode = table->mode,
2404 	};
2405 
2406 	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2407 	if (write && ret >= 0) {
2408 		if (val < MIN_NICE || val > MAX_NICE) {
2409 			ret = -EINVAL;
2410 		} else {
2411 			mutex_lock(&ipvs->est_mutex);
2412 			if (*valp != val) {
2413 				*valp = val;
2414 				ip_vs_est_reload_start(ipvs);
2415 			}
2416 			mutex_unlock(&ipvs->est_mutex);
2417 		}
2418 	}
2419 	return ret;
2420 }
2421 
2422 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
2423 				    void *buffer, size_t *lenp, loff_t *ppos)
2424 {
2425 	struct netns_ipvs *ipvs = table->extra2;
2426 	int *valp = table->data;
2427 	int val = *valp;
2428 	int ret;
2429 
2430 	struct ctl_table tmp_table = {
2431 		.data = &val,
2432 		.maxlen = sizeof(int),
2433 		.mode = table->mode,
2434 	};
2435 
2436 	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2437 	if (write && ret >= 0) {
2438 		mutex_lock(&ipvs->est_mutex);
2439 		if (*valp != val) {
2440 			*valp = val;
2441 			ip_vs_est_reload_start(ipvs);
2442 		}
2443 		mutex_unlock(&ipvs->est_mutex);
2444 	}
2445 	return ret;
2446 }
2447 
2448 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
2449 				  void *buffer, size_t *lenp, loff_t *ppos)
2450 {
2451 	struct netns_ipvs *ipvs = table->extra2;
2452 	int *valp = table->data;
2453 	int val = *valp;
2454 	int ret;
2455 
2456 	struct ctl_table tmp_table = {
2457 		.data = &val,
2458 		.maxlen = sizeof(int),
2459 	};
2460 
2461 	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2462 	if (write && ret >= 0) {
2463 		if (val < -8 || val > 8) {
2464 			ret = -EINVAL;
2465 		} else {
2466 			*valp = val;
2467 			if (rcu_access_pointer(ipvs->conn_tab))
2468 				mod_delayed_work(system_unbound_wq,
2469 						 &ipvs->conn_resize_work, 0);
2470 		}
2471 	}
2472 	return ret;
2473 }
2474 
2475 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
2476 				 void *buffer, size_t *lenp, loff_t *ppos)
2477 {
2478 	struct netns_ipvs *ipvs = table->extra2;
2479 	int *valp = table->data;
2480 	int val = *valp;
2481 	int ret;
2482 
2483 	struct ctl_table tmp_table = {
2484 		.data = &val,
2485 		.maxlen = sizeof(int),
2486 	};
2487 
2488 	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2489 	if (write && ret >= 0) {
2490 		if (val < -8 || val > 8) {
2491 			ret = -EINVAL;
2492 		} else {
2493 			*valp = val;
2494 			if (rcu_access_pointer(ipvs->svc_table))
2495 				mod_delayed_work(system_unbound_wq,
2496 						 &ipvs->svc_resize_work, 0);
2497 		}
2498 	}
2499 	return ret;
2500 }
2501 
2502 /*
2503  *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
2504  *	Do not change order or insert new entries without
2505  *	align with netns init in ip_vs_control_net_init()
2506  */
2507 
2508 static struct ctl_table vs_vars[] = {
2509 	{
2510 		.procname	= "amemthresh",
2511 		.maxlen		= sizeof(int),
2512 		.mode		= 0644,
2513 		.proc_handler	= proc_dointvec,
2514 	},
2515 	{
2516 		.procname	= "am_droprate",
2517 		.maxlen		= sizeof(int),
2518 		.mode		= 0644,
2519 		.proc_handler	= proc_dointvec,
2520 	},
2521 	{
2522 		.procname	= "drop_entry",
2523 		.maxlen		= sizeof(int),
2524 		.mode		= 0644,
2525 		.proc_handler	= proc_do_defense_mode,
2526 	},
2527 	{
2528 		.procname	= "drop_packet",
2529 		.maxlen		= sizeof(int),
2530 		.mode		= 0644,
2531 		.proc_handler	= proc_do_defense_mode,
2532 	},
2533 #ifdef CONFIG_IP_VS_NFCT
2534 	{
2535 		.procname	= "conntrack",
2536 		.maxlen		= sizeof(int),
2537 		.mode		= 0644,
2538 		.proc_handler	= &proc_dointvec,
2539 	},
2540 #endif
2541 	{
2542 		.procname	= "secure_tcp",
2543 		.maxlen		= sizeof(int),
2544 		.mode		= 0644,
2545 		.proc_handler	= proc_do_defense_mode,
2546 	},
2547 	{
2548 		.procname	= "snat_reroute",
2549 		.maxlen		= sizeof(int),
2550 		.mode		= 0644,
2551 		.proc_handler	= &proc_dointvec,
2552 	},
2553 	{
2554 		.procname	= "sync_version",
2555 		.maxlen		= sizeof(int),
2556 		.mode		= 0644,
2557 		.proc_handler	= proc_dointvec_minmax,
2558 		.extra1		= SYSCTL_ZERO,
2559 		.extra2		= SYSCTL_ONE,
2560 	},
2561 	{
2562 		.procname	= "sync_ports",
2563 		.maxlen		= sizeof(int),
2564 		.mode		= 0644,
2565 		.proc_handler	= proc_do_sync_ports,
2566 	},
2567 	{
2568 		.procname	= "sync_persist_mode",
2569 		.maxlen		= sizeof(int),
2570 		.mode		= 0644,
2571 		.proc_handler	= proc_dointvec,
2572 	},
2573 	{
2574 		.procname	= "sync_qlen_max",
2575 		.maxlen		= sizeof(unsigned long),
2576 		.mode		= 0644,
2577 		.proc_handler	= proc_doulongvec_minmax,
2578 	},
2579 	{
2580 		.procname	= "sync_sock_size",
2581 		.maxlen		= sizeof(int),
2582 		.mode		= 0644,
2583 		.proc_handler	= proc_dointvec,
2584 	},
2585 	{
2586 		.procname	= "cache_bypass",
2587 		.maxlen		= sizeof(int),
2588 		.mode		= 0644,
2589 		.proc_handler	= proc_dointvec,
2590 	},
2591 	{
2592 		.procname	= "expire_nodest_conn",
2593 		.maxlen		= sizeof(int),
2594 		.mode		= 0644,
2595 		.proc_handler	= proc_dointvec,
2596 	},
2597 	{
2598 		.procname	= "sloppy_tcp",
2599 		.maxlen		= sizeof(int),
2600 		.mode		= 0644,
2601 		.proc_handler	= proc_dointvec,
2602 	},
2603 	{
2604 		.procname	= "sloppy_sctp",
2605 		.maxlen		= sizeof(int),
2606 		.mode		= 0644,
2607 		.proc_handler	= proc_dointvec,
2608 	},
2609 	{
2610 		.procname	= "expire_quiescent_template",
2611 		.maxlen		= sizeof(int),
2612 		.mode		= 0644,
2613 		.proc_handler	= proc_dointvec,
2614 	},
2615 	{
2616 		.procname	= "sync_threshold",
2617 		.maxlen		=
2618 			sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
2619 		.mode		= 0644,
2620 		.proc_handler	= proc_do_sync_threshold,
2621 	},
2622 	{
2623 		.procname	= "sync_refresh_period",
2624 		.maxlen		= sizeof(int),
2625 		.mode		= 0644,
2626 		.proc_handler	= proc_dointvec_jiffies,
2627 	},
2628 	{
2629 		.procname	= "sync_retries",
2630 		.maxlen		= sizeof(int),
2631 		.mode		= 0644,
2632 		.proc_handler	= proc_dointvec_minmax,
2633 		.extra1		= SYSCTL_ZERO,
2634 		.extra2		= SYSCTL_THREE,
2635 	},
2636 	{
2637 		.procname	= "nat_icmp_send",
2638 		.maxlen		= sizeof(int),
2639 		.mode		= 0644,
2640 		.proc_handler	= proc_dointvec,
2641 	},
2642 	{
2643 		.procname	= "pmtu_disc",
2644 		.maxlen		= sizeof(int),
2645 		.mode		= 0644,
2646 		.proc_handler	= proc_dointvec,
2647 	},
2648 	{
2649 		.procname	= "backup_only",
2650 		.maxlen		= sizeof(int),
2651 		.mode		= 0644,
2652 		.proc_handler	= proc_dointvec,
2653 	},
2654 	{
2655 		.procname	= "conn_reuse_mode",
2656 		.maxlen		= sizeof(int),
2657 		.mode		= 0644,
2658 		.proc_handler	= proc_dointvec,
2659 	},
2660 	{
2661 		.procname	= "schedule_icmp",
2662 		.maxlen		= sizeof(int),
2663 		.mode		= 0644,
2664 		.proc_handler	= proc_dointvec,
2665 	},
2666 	{
2667 		.procname	= "ignore_tunneled",
2668 		.maxlen		= sizeof(int),
2669 		.mode		= 0644,
2670 		.proc_handler	= proc_dointvec,
2671 	},
2672 	{
2673 		.procname	= "run_estimation",
2674 		.maxlen		= sizeof(int),
2675 		.mode		= 0644,
2676 		.proc_handler	= ipvs_proc_run_estimation,
2677 	},
2678 	{
2679 		.procname	= "est_cpulist",
2680 		.maxlen		= NR_CPUS,	/* unused */
2681 		.mode		= 0644,
2682 		.proc_handler	= ipvs_proc_est_cpulist,
2683 	},
2684 	{
2685 		.procname	= "est_nice",
2686 		.maxlen		= sizeof(int),
2687 		.mode		= 0644,
2688 		.proc_handler	= ipvs_proc_est_nice,
2689 	},
2690 	{
2691 		.procname	= "conn_lfactor",
2692 		.maxlen		= sizeof(int),
2693 		.mode		= 0644,
2694 		.proc_handler	= ipvs_proc_conn_lfactor,
2695 	},
2696 	{
2697 		.procname	= "svc_lfactor",
2698 		.maxlen		= sizeof(int),
2699 		.mode		= 0644,
2700 		.proc_handler	= ipvs_proc_svc_lfactor,
2701 	},
2702 #ifdef CONFIG_IP_VS_DEBUG
2703 	{
2704 		.procname	= "debug_level",
2705 		.data		= &sysctl_ip_vs_debug_level,
2706 		.maxlen		= sizeof(int),
2707 		.mode		= 0644,
2708 		.proc_handler	= proc_dointvec,
2709 	},
2710 #endif
2711 };
2712 
2713 #endif
2714 
2715 #ifdef CONFIG_PROC_FS
2716 
/* Cursor state for the /proc/net/ip_vs seq_file walk */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it*/
	struct ip_vs_rht *t;	/* service table seen at seq_start */
	u32 bucket;		/* bucket of the last returned service */
};
2722 
2723 /*
2724  *	Write the contents of the VS rule table to a PROCfs file.
2725  *	(It is kept just for backward compatibility)
2726  */
2727 static inline const char *ip_vs_fwd_name(unsigned int flags)
2728 {
2729 	switch (flags & IP_VS_CONN_F_FWD_MASK) {
2730 	case IP_VS_CONN_F_LOCALNODE:
2731 		return "Local";
2732 	case IP_VS_CONN_F_TUNNEL:
2733 		return "Tunnel";
2734 	case IP_VS_CONN_F_DROUTE:
2735 		return "Route";
2736 	default:
2737 		return "Masq";
2738 	}
2739 }
2740 
/* Do not expect consistent view during add, del and move(table resize).
 * We may miss entries and even show duplicates.
 * Returns the pos'th service of the snapshot table, or NULL.
 */
static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
{
	struct ip_vs_iter *iter = seq->private;
	struct ip_vs_rht *t = iter->t;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	int idx;

	if (!t)
		return NULL;
	/* Scan buckets until the pos'th service in this table is found */
	for (idx = 0; idx < t->size; idx++) {
		hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) {
			/* Entry re-hashed into a newer table ? Stop chain */
			if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
				break;
			if (pos-- == 0) {
				iter->bucket = idx;
				return svc;
			}
		}
	}
	return NULL;
}
2766 
/* seq_file start: snapshot the service table under RCU; the read lock
 * is held for the whole dump and dropped in ip_vs_info_seq_stop().
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct ip_vs_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();
	iter->t = rcu_dereference(ipvs->svc_table);
	/* *pos == 0 emits the header first (SEQ_START_TOKEN) */
	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
2778 
2779 
/* seq_file next: advance to the next service, first within the current
 * bucket chain, then across the remaining buckets.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_service *svc;
	struct ip_vs_iter *iter;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq,0);

	svc = v;
	iter = seq->private;
	t = iter->t;
	if (!t)
		return NULL;

	/* Continue within the bucket chain of the current entry */
	hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) {
		/* Our cursor was moved to new table ? */
		if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
			break;
		return svc;
	}

	/* Then take the first entry of the following buckets */
	while (++iter->bucket < t->size) {
		hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket],
					    s_list) {
			if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
				break;
			return svc;
		}
	}
	return NULL;
}
2814 
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	/* Matches rcu_read_lock() taken in ip_vs_info_seq_start() */
	rcu_read_unlock();
}
2820 
2821 
/* seq_file show: emit the header, or one service followed by one line
 * per real server.
 */
static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq,
			"IP Virtual Server version %d.%d.%d (size=%d)\n",
			NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
		seq_puts(seq,
			 "Prot LocalAddress:Port Scheduler Flags\n");
		seq_puts(seq,
			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
	} else {
		const struct ip_vs_service *svc = v;
		const struct ip_vs_dest *dest;
		struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
		char *sched_name = sched ? sched->name : "none";

		/* Address-based services show proto/addr/port; fwmark
		 * services show the firewall mark instead.
		 */
		if (!svc->fwmark) {
#ifdef CONFIG_IP_VS_IPV6
			if (svc->af == AF_INET6)
				seq_printf(seq, "%s  [%pI6]:%04X %s ",
					   ip_vs_proto_name(svc->protocol),
					   &svc->addr.in6,
					   ntohs(svc->port),
					   sched_name);
			else
#endif
				seq_printf(seq, "%s  %08X:%04X %s %s ",
					   ip_vs_proto_name(svc->protocol),
					   ntohl(svc->addr.ip),
					   ntohs(svc->port),
					   sched_name,
					   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
		} else {
			seq_printf(seq, "FWM  %08X %s %s",
				   svc->fwmark, sched_name,
				   (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
		}

		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
			seq_printf(seq, "persistent %d %08X\n",
				svc->timeout,
				ntohl(svc->netmask));
		else
			seq_putc(seq, '\n');

		/* One line per real server of this service */
		list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
			if (dest->af == AF_INET6)
				seq_printf(seq,
					   "  -> [%pI6]:%04X"
					   "      %-7s %-6d %-10d %-10d\n",
					   &dest->addr.in6,
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));
			else
#endif
				seq_printf(seq,
					   "  -> %08X:%04X      "
					   "%-7s %-6d %-10d %-10d\n",
					   ntohl(dest->addr.ip),
					   ntohs(dest->port),
					   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
					   atomic_read(&dest->weight),
					   atomic_read(&dest->activeconns),
					   atomic_read(&dest->inactconns));

		}
	}
	return 0;
}
2898 
/* seq_file callbacks for /proc/net/ip_vs */
static const struct seq_operations ip_vs_info_seq_ops = {
	.start = ip_vs_info_seq_start,
	.next  = ip_vs_info_seq_next,
	.stop  = ip_vs_info_seq_stop,
	.show  = ip_vs_info_seq_show,
};
2905 
/* Show the netns-wide totals and rates (/proc/net/ip_vs_stats) */
static int ip_vs_stats_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
	struct ip_vs_kstats show;

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_puts(seq,
		 "   Conns  Packets  Packets            Bytes            Bytes\n");

	/* Copy the totals into a local snapshot before printing */
	ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
		   (unsigned long long)show.conns,
		   (unsigned long long)show.inpkts,
		   (unsigned long long)show.outpkts,
		   (unsigned long long)show.inbytes,
		   (unsigned long long)show.outbytes);

/*                01234567 01234567 01234567 0123456701234567 0123456701234567*/
	seq_puts(seq,
		 " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
		   (unsigned long long)show.cps,
		   (unsigned long long)show.inpps,
		   (unsigned long long)show.outpps,
		   (unsigned long long)show.inbps,
		   (unsigned long long)show.outbps);

	return 0;
}
2937 
/* Show per-CPU counters plus totals and rates
 * (/proc/net/ip_vs_stats_percpu).
 */
static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
	struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
	struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
	struct ip_vs_kstats kstats;
	int i;

/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "       Total Incoming Outgoing         Incoming         Outgoing\n");
	seq_puts(seq,
		 "CPU    Conns  Packets  Packets            Bytes            Bytes\n");

	for_each_possible_cpu(i) {
		struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
		unsigned int start;
		u64 conns, inpkts, outpkts, inbytes, outbytes;

		/* u64_stats retry loop gives a consistent snapshot of
		 * this CPU's counters on 32-bit hosts too.
		 */
		do {
			start = u64_stats_fetch_begin(&u->syncp);
			conns = u64_stats_read(&u->cnt.conns);
			inpkts = u64_stats_read(&u->cnt.inpkts);
			outpkts = u64_stats_read(&u->cnt.outpkts);
			inbytes = u64_stats_read(&u->cnt.inbytes);
			outbytes = u64_stats_read(&u->cnt.outbytes);
		} while (u64_stats_fetch_retry(&u->syncp, start));

		seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
			   i, (u64)conns, (u64)inpkts,
			   (u64)outpkts, (u64)inbytes,
			   (u64)outbytes);
	}

	ip_vs_copy_stats(&kstats, tot_stats);

	/* "~" row: the aggregated totals across all CPUs */
	seq_printf(seq, "  ~ %8LX %8LX %8LX %16LX %16LX\n\n",
		   (unsigned long long)kstats.conns,
		   (unsigned long long)kstats.inpkts,
		   (unsigned long long)kstats.outpkts,
		   (unsigned long long)kstats.inbytes,
		   (unsigned long long)kstats.outbytes);

/*                ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
	seq_puts(seq,
		 "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
	seq_printf(seq, "    %8LX %8LX %8LX %16LX %16LX\n",
		   kstats.cps,
		   kstats.inpps,
		   kstats.outpps,
		   kstats.inbps,
		   kstats.outbps);

	return 0;
}
2993 
/* Show hash-table occupancy histograms for the connection and service
 * tables plus estimator thread info (/proc/net/ip_vs_status).  Walks
 * restart when the generation counter shows a table was replaced.
 */
static int ip_vs_status_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_single_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);
	unsigned int resched_score = 0;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	int old_gen, new_gen;
	u32 counts[8];		/* histogram: chain length 0..6, 7 = 7+ */
	u32 bucket;
	int count;
	u32 sum1;		/* number of non-empty buckets */
	u32 sum;		/* total number of buckets */
	int i;

	rcu_read_lock();

	t = rcu_dereference(ipvs->conn_tab);

	seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count));
	seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n",
		   t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);

	if (!atomic_read(&ipvs->conn_count))
		goto after_conns;
	old_gen = atomic_read(&ipvs->conn_tab_changes);

repeat_conn:
	smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
	memset(counts, 0, sizeof(counts));
	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		for (bucket = 0; bucket < t->size; bucket++) {
			DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();

			count = 0;
			resched_score++;
			/* Count chain length; the inner reset handles a
			 * retried bucket walk.
			 */
			ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
				count = 0;
				hlist_bl_for_each_entry_rcu(hn, e, head, node)
					count++;
			}
			resched_score += count;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched_rcu();
				new_gen = atomic_read(&ipvs->conn_tab_changes);
				/* New table installed ? */
				if (old_gen != new_gen) {
					old_gen = new_gen;
					goto repeat_conn;
				}
			}
			counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
		}
	}
	for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
		sum += counts[i];
	sum1 = sum - counts[0];
	seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
		   counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
	for (i = 1; i < ARRAY_SIZE(counts); i++) {
		if (!counts[i])
			continue;
		seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
			   i, counts[i],
			   (unsigned long)counts[i] * 100 / max(sum1, 1U));
	}

after_conns:
	t = rcu_dereference(ipvs->svc_table);

	count = ip_vs_get_num_services(ipvs);
	seq_printf(seq, "Services:\t%d\n", count);
	seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
		   t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);

	if (!count)
		goto after_svc;
	old_gen = atomic_read(&ipvs->svc_table_changes);

repeat_svc:
	smp_rmb(); /* ipvs->svc_table and svc_table_changes */
	memset(counts, 0, sizeof(counts));
	/* Same histogram walk as above, for the service table */
	ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
		for (bucket = 0; bucket < t->size; bucket++) {
			DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();

			count = 0;
			resched_score++;
			ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
				count = 0;
				hlist_bl_for_each_entry_rcu(svc, e, head,
							    s_list)
					count++;
			}
			resched_score += count;
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched_rcu();
				new_gen = atomic_read(&ipvs->svc_table_changes);
				/* New table installed ? */
				if (old_gen != new_gen) {
					old_gen = new_gen;
					goto repeat_svc;
				}
			}
			counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
		}
	}
	for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
		sum += counts[i];
	sum1 = sum - counts[0];
	seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
		   counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
	for (i = 1; i < ARRAY_SIZE(counts); i++) {
		if (!counts[i])
			continue;
		seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
			   i, counts[i],
			   (unsigned long)counts[i] * 100 / max(sum1, 1U));
	}

after_svc:
	seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
		   ipvs->est_kt_count, ipvs->est_max_threads);
	seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
	seq_printf(seq, "Stats thread ests:\t%d\n",
		   ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
		   IPVS_EST_NTICKS);

	rcu_read_unlock();
	return 0;
}
3130 
3131 #endif
3132 
3133 /*
3134  *	Set timeout values for tcp tcpfin udp in the timeout_table.
3135  */
3136 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3137 {
3138 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3139 	struct ip_vs_proto_data *pd;
3140 #endif
3141 
3142 	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
3143 		  u->tcp_timeout,
3144 		  u->tcp_fin_timeout,
3145 		  u->udp_timeout);
3146 
3147 #ifdef CONFIG_IP_VS_PROTO_TCP
3148 	if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
3149 	    u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
3150 		return -EINVAL;
3151 	}
3152 #endif
3153 
3154 #ifdef CONFIG_IP_VS_PROTO_UDP
3155 	if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
3156 		return -EINVAL;
3157 #endif
3158 
3159 #ifdef CONFIG_IP_VS_PROTO_TCP
3160 	if (u->tcp_timeout) {
3161 		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3162 		pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
3163 			= u->tcp_timeout * HZ;
3164 	}
3165 
3166 	if (u->tcp_fin_timeout) {
3167 		pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3168 		pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
3169 			= u->tcp_fin_timeout * HZ;
3170 	}
3171 #endif
3172 
3173 #ifdef CONFIG_IP_VS_PROTO_UDP
3174 	if (u->udp_timeout) {
3175 		pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3176 		pd->timeout_table[IP_VS_UDP_S_NORMAL]
3177 			= u->udp_timeout * HZ;
3178 	}
3179 #endif
3180 	return 0;
3181 }
3182 
/* Index of a sockopt command relative to the first IPVS command */
#define CMDID(cmd)		(cmd - IP_VS_BASE_CTL)

/* Argument layout for the combined service+dest sockopt commands */
struct ip_vs_svcdest_user {
	struct ip_vs_service_user	s;
	struct ip_vs_dest_user		d;
};
3189 
/* Expected argument length of each IP_VS_SO_SET_* sockopt command;
 * do_ip_vs_set_ctl() rejects requests with a mismatching length.
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
	[CMDID(IP_VS_SO_SET_ADD)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_EDIT)]        = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_DEL)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_ADDDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_DELDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_EDITDEST)]    = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_TIMEOUT)]     = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_STOPDAEMON)]  = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_ZERO)]        = sizeof(struct ip_vs_service_user),
};
3202 
/* Union of all per-command argument types; used only so that
 * MAX_SET_ARGLEN sizes the on-stack buffer for the largest command.
 */
union ip_vs_set_arglen {
	struct ip_vs_service_user	field_IP_VS_SO_SET_ADD;
	struct ip_vs_service_user	field_IP_VS_SO_SET_EDIT;
	struct ip_vs_service_user	field_IP_VS_SO_SET_DEL;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_ADDDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_DELDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_EDITDEST;
	struct ip_vs_timeout_user	field_IP_VS_SO_SET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STARTDAEMON;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STOPDAEMON;
	struct ip_vs_service_user	field_IP_VS_SO_SET_ZERO;
};

#define MAX_SET_ARGLEN	sizeof(union ip_vs_set_arglen)
3217 
3218 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
3219 				  struct ip_vs_service_user *usvc_compat)
3220 {
3221 	memset(usvc, 0, sizeof(*usvc));
3222 
3223 	usvc->af		= AF_INET;
3224 	usvc->protocol		= usvc_compat->protocol;
3225 	usvc->addr.ip		= usvc_compat->addr;
3226 	usvc->port		= usvc_compat->port;
3227 	usvc->fwmark		= usvc_compat->fwmark;
3228 
3229 	/* Deep copy of sched_name is not needed here */
3230 	usvc->sched_name	= usvc_compat->sched_name;
3231 
3232 	usvc->flags		= usvc_compat->flags;
3233 	usvc->timeout		= usvc_compat->timeout;
3234 	usvc->netmask		= usvc_compat->netmask;
3235 }
3236 
3237 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
3238 				   struct ip_vs_dest_user *udest_compat)
3239 {
3240 	memset(udest, 0, sizeof(*udest));
3241 
3242 	udest->addr.ip		= udest_compat->addr;
3243 	udest->port		= udest_compat->port;
3244 	udest->conn_flags	= udest_compat->conn_flags;
3245 	udest->weight		= udest_compat->weight;
3246 	udest->u_threshold	= udest_compat->u_threshold;
3247 	udest->l_threshold	= udest_compat->l_threshold;
3248 	udest->af		= AF_INET;
3249 	udest->tun_type		= IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
3250 }
3251 
/* setsockopt() handler for all IP_VS_SO_SET_* commands.
 *
 * Copies the fixed, per-command argument from userspace, converts the
 * legacy (IPv4-only) structures to the kernel-internal forms and
 * dispatches to the service/destination management functions.  Sync
 * daemon start/stop is handled first because it uses its own locking;
 * all other commands run under ipvs->service_mutex.
 *
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	/* Each command takes exactly one, known, argument size */
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	if (copy_from_sockptr(arg, ptr, len) != 0)
		return -EFAULT;

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
			struct ipvs_sync_daemon_cfg cfg;

			memset(&cfg, 0, sizeof(cfg));
			ret = -EINVAL;
			/* Reject empty or overlong interface names */
			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
				    sizeof(cfg.mcast_ifn)) <= 0)
				return ret;
			cfg.syncid = dm->syncid;
			ret = start_sync_thread(ipvs, &cfg, dm->state);
		} else {
			ret = stop_sync_thread(ipvs, dm->state);
		}
		return ret;
	}

	mutex_lock(&ipvs->service_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(ipvs, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (!len) {
		/* No more commands with len == 0 below */
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Remaining commands carry a service description, optionally
	 * followed by a destination description, at the buffer start.
	 */
	usvc_compat = (struct ip_vs_service_user *)arg;
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(ipvs);
			goto out_unlock;
		}
	}

	/* Scheduler name must be NUL-terminated within the maximum length */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* Every command except ADD operates on an existing service */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		/* Unreachable: cmd range and arglen were validated above */
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		break;
	}

  out_unlock:
	mutex_unlock(&ipvs->service_mutex);
	return ret;
}
3401 
3402 
3403 static void
3404 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
3405 {
3406 	struct ip_vs_scheduler *sched;
3407 	struct ip_vs_kstats kstats;
3408 	char *sched_name;
3409 
3410 	sched = rcu_dereference_protected(src->scheduler, 1);
3411 	sched_name = sched ? sched->name : "none";
3412 	dst->protocol = src->protocol;
3413 	dst->addr = src->addr.ip;
3414 	dst->port = src->port;
3415 	dst->fwmark = src->fwmark;
3416 	strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
3417 	dst->flags = src->flags;
3418 	dst->timeout = src->timeout / HZ;
3419 	dst->netmask = src->netmask;
3420 	dst->num_dests = src->num_dests;
3421 	ip_vs_copy_stats(&kstats, &src->stats);
3422 	ip_vs_export_stats_user(&dst->stats, &kstats);
3423 }
3424 
/* Copy up to get->num_services IPv4 service entries to userspace for
 * the legacy IP_VS_SO_GET_SERVICES sockopt.  The caller holds
 * svc_resize_sem (read) - asserted below - and service_mutex (see
 * do_ip_vs_get_ctl()), so the table can be walked without it being
 * resized or modified.  Returns 0 or -EFAULT on copy failure.
 */
static inline int
__ip_vs_get_service_entries(struct netns_ipvs *ipvs,
			    const struct ip_vs_get_services *get,
			    struct ip_vs_get_services __user *uptr)
{
	struct ip_vs_service_entry entry;
	DECLARE_IP_VS_RHT_WALK_BUCKETS();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	int count = 0;
	int ret = 0;

	lockdep_assert_held(&ipvs->svc_resize_sem);
	/* All service modifications are disabled, go ahead */
	ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
		hlist_bl_for_each_entry(svc, e, head, s_list) {
			/* Only expose IPv4 entries to old interface */
			if (svc->af != AF_INET)
				continue;

			/* Stop once the user-supplied buffer is full */
			if (count >= get->num_services)
				goto out;
			memset(&entry, 0, sizeof(entry));
			ip_vs_copy_service(&entry, svc);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				goto out;
			}
			count++;
		}
	}

out:
	return ret;
}
3462 
/* Copy the destinations of one service to userspace for the legacy
 * IP_VS_SO_GET_DESTS sockopt.  The service is found by fwmark or by
 * <protocol, addr, port>; at most get->num_dests entries are copied.
 * Called with ipvs->service_mutex held (see do_ip_vs_get_ctl()), which
 * keeps the destination list stable.  Returns 0, -ESRCH if the service
 * does not exist, or -EFAULT on copy failure.
 */
static inline int
__ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
			 struct ip_vs_get_dests __user *uptr)
{
	struct ip_vs_service *svc;
	union nf_inet_addr addr = { .ip = get->addr };
	int ret = 0;

	rcu_read_lock();
	if (get->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
					   get->port);
	rcu_read_unlock();

	if (svc) {
		int count = 0;
		struct ip_vs_dest *dest;
		struct ip_vs_dest_entry entry;
		struct ip_vs_kstats kstats;

		memset(&entry, 0, sizeof(entry));
		list_for_each_entry(dest, &svc->destinations, n_list) {
			if (count >= get->num_dests)
				break;

			/* Cannot expose heterogeneous members via sockopt
			 * interface
			 */
			if (dest->af != svc->af)
				continue;

			entry.addr = dest->addr.ip;
			entry.port = dest->port;
			entry.conn_flags = atomic_read(&dest->conn_flags);
			entry.weight = atomic_read(&dest->weight);
			entry.u_threshold = dest->u_threshold;
			entry.l_threshold = dest->l_threshold;
			entry.activeconns = atomic_read(&dest->activeconns);
			entry.inactconns = atomic_read(&dest->inactconns);
			entry.persistconns = atomic_read(&dest->persistconns);
			ip_vs_copy_stats(&kstats, &dest->stats);
			ip_vs_export_stats_user(&entry.stats, &kstats);
			if (copy_to_user(&uptr->entrytable[count],
					 &entry, sizeof(entry))) {
				ret = -EFAULT;
				break;
			}
			count++;
		}
	} else
		ret = -ESRCH;
	return ret;
}
3518 
/* Report the configured TCP, TCP-FIN and UDP connection timeouts in
 * seconds for the legacy IP_VS_SO_GET_TIMEOUT sockopt.  Fields for
 * protocols that are compiled out remain zero from the memset.
 */
static inline void
__ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
{
#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
	struct ip_vs_proto_data *pd;
#endif

	memset(u, 0, sizeof (*u));

#ifdef CONFIG_IP_VS_PROTO_TCP
	pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
	/* timeout_table holds jiffies; convert to seconds */
	u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
	u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
	pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
	u->udp_timeout =
			pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
#endif
}
3539 
/* Minimum argument size for each IP_VS_SO_GET_* command; requests with
 * a smaller buffer are rejected in do_ip_vs_get_ctl().  GET_DAEMON
 * returns two entries: master and backup daemon state.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)]  = 64,
	[CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
};
3549 
/* Sizing helper only: one member per IP_VS_SO_GET_* command so that
 * sizeof() yields the largest getsockopt argument.  Never instantiated;
 * only MAX_GET_ARGLEN is used to size the on-stack copy buffer in
 * do_ip_vs_get_ctl().
 */
union ip_vs_get_arglen {
	char				field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo		field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services	field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry	field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests		field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user	field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_GET_DAEMON[2];
};

#define MAX_GET_ARGLEN	sizeof(union ip_vs_get_arglen)
3561 
/* getsockopt() handler for all IP_VS_SO_GET_* commands.
 *
 * *len is the size of the userspace buffer; it is updated for the
 * variable-sized GET_VERSION reply.  GET_DAEMON is handled first under
 * sync_mutex; GET_SERVICES additionally takes svc_resize_sem so the
 * whole table can be walked; everything else runs under service_mutex.
 * Returns 0 on success or a negative errno.
 */
static int
do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
{
	unsigned char arg[MAX_GET_ARGLEN];
	int ret = 0;
	unsigned int copylen;
	struct net *net = sock_net(sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	BUG_ON(!net);
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
		return -EINVAL;

	copylen = get_arglen[CMDID(cmd)];
	if (*len < (int) copylen) {
		IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
		return -EINVAL;
	}

	if (copy_from_user(arg, user, copylen) != 0)
		return -EFAULT;
	/*
	 * Handle daemons first since it has its own locking
	 */
	if (cmd == IP_VS_SO_GET_DAEMON) {
		struct ip_vs_daemon_user d[2];

		/* d[0] describes the master daemon, d[1] the backup one */
		memset(&d, 0, sizeof(d));
		mutex_lock(&ipvs->sync_mutex);
		if (ipvs->sync_state & IP_VS_STATE_MASTER) {
			d[0].state = IP_VS_STATE_MASTER;
			strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
				sizeof(d[0].mcast_ifn));
			d[0].syncid = ipvs->mcfg.syncid;
		}
		if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
			d[1].state = IP_VS_STATE_BACKUP;
			strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
				sizeof(d[1].mcast_ifn));
			d[1].syncid = ipvs->bcfg.syncid;
		}
		if (copy_to_user(user, &d, sizeof(d)) != 0)
			ret = -EFAULT;
		mutex_unlock(&ipvs->sync_mutex);
		return ret;
	}

	if (cmd == IP_VS_SO_GET_SERVICES) {
		struct ip_vs_get_services *get;
		size_t size;

		get = (struct ip_vs_get_services *)arg;
		/* Buffer must exactly fit the requested number of entries */
		size = struct_size(get, entrytable, get->num_services);
		if (*len != size) {
			pr_err("length: %u != %zu\n", *len, size);
			return -EINVAL;
		}
		/* Protect against table resizer moving the entries.
		 * Try reverse locking, so that we do not hold the mutex
		 * while waiting for semaphore.
		 */
		while (1) {
			ret = down_read_killable(&ipvs->svc_resize_sem);
			if (ret < 0)
				return ret;
			if (mutex_trylock(&ipvs->service_mutex))
				break;
			up_read(&ipvs->svc_resize_sem);
			cond_resched();
		}
		ret = __ip_vs_get_service_entries(ipvs, get, user);
		up_read(&ipvs->svc_resize_sem);
		mutex_unlock(&ipvs->service_mutex);
		return ret;
	}

	mutex_lock(&ipvs->service_mutex);
	switch (cmd) {
	case IP_VS_SO_GET_VERSION:
	{
		char buf[64];

		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
			NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
			ret = -EFAULT;
			goto out;
		}
		/* Tell the caller the actual reply length incl. NUL */
		*len = strlen(buf)+1;
	}
	break;

	case IP_VS_SO_GET_INFO:
	{
		struct ip_vs_getinfo info;

		info.version = IP_VS_VERSION_CODE;
		info.size = get_conn_tab_size(ipvs);
		info.num_services =
			atomic_read(&ipvs->num_services[IP_VS_AF_INET]);
		if (copy_to_user(user, &info, sizeof(info)) != 0)
			ret = -EFAULT;
	}
	break;

	case IP_VS_SO_GET_SERVICE:
	{
		struct ip_vs_service_entry *entry;
		struct ip_vs_service *svc;
		union nf_inet_addr addr;

		/* The copied-in entry identifies the service; it is then
		 * reused as the reply buffer.
		 */
		entry = (struct ip_vs_service_entry *)arg;
		addr.ip = entry->addr;
		rcu_read_lock();
		if (entry->fwmark)
			svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
		else
			svc = __ip_vs_service_find(ipvs, AF_INET,
						   entry->protocol, &addr,
						   entry->port);
		rcu_read_unlock();
		if (svc) {
			ip_vs_copy_service(entry, svc);
			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
				ret = -EFAULT;
		} else
			ret = -ESRCH;
	}
	break;

	case IP_VS_SO_GET_DESTS:
	{
		struct ip_vs_get_dests *get;
		size_t size;

		get = (struct ip_vs_get_dests *)arg;
		/* Buffer must exactly fit the requested number of entries */
		size = struct_size(get, entrytable, get->num_dests);
		if (*len != size) {
			pr_err("length: %u != %zu\n", *len, size);
			ret = -EINVAL;
			goto out;
		}
		ret = __ip_vs_get_dest_entries(ipvs, get, user);
	}
	break;

	case IP_VS_SO_GET_TIMEOUT:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
		if (copy_to_user(user, &t, sizeof(t)) != 0)
			ret = -EFAULT;
	}
	break;

	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);
	return ret;
}
3730 
3731 
/* Registration of the legacy [gs]etsockopt control interface on
 * PF_INET sockets, covering the IP_VS_BASE_CTL..IP_VS_SO_{SET,GET}_MAX
 * command ranges handled by do_ip_vs_set_ctl()/do_ip_vs_get_ctl().
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
	.owner		= THIS_MODULE,
};
3742 
3743 /*
3744  * Generic Netlink interface
3745  */
3746 
3747 /* IPVS genetlink family */
3748 static struct genl_family ip_vs_genl_family;
3749 
3750 /* Policy used for first-level command attributes */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	/* Nested attribute sets, validated by their own policies below */
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};
3759 
3760 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	/* For NLA_NUL_STRING, .len excludes the terminating NUL,
	 * hence the -1 against the buffer size.
	 */
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_IFNAME_MAXLEN - 1 },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IPVS_DAEMON_ATTR_MCAST_PORT]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_TTL]	= { .type = NLA_U8 },
};
3772 
3773 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	/* NOTE(review): unlike SCHED_NAME above, this .len does not
	 * subtract 1 for the terminating NUL - confirm this is intended.
	 */
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};
3791 
3792 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
	/* Optional; absence handled in ip_vs_genl_parse_dest() */
	[IPVS_DEST_ATTR_ADDR_FAMILY]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_TYPE]	= { .type = NLA_U8 },
	[IPVS_DEST_ATTR_TUN_PORT]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_FLAGS]	= { .type = NLA_U16 },
};
3810 
/* Emit the legacy stats container: packet/rate counters are truncated
 * to 32 bits, only the byte counters are exported as 64-bit.  Returns
 * 0 or -EMSGSIZE, cancelling the started nest on failure.
 */
static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
				 struct ip_vs_kstats *kstats)
{
	struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);

	if (!nl_stats)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
	    nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
		goto nla_put_failure;
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}
3840 
/* Emit the full 64-bit stats container (all counters exported as u64),
 * the non-truncating counterpart of ip_vs_genl_fill_stats().  Returns
 * 0 or -EMSGSIZE, cancelling the started nest on failure.
 */
static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
				   struct ip_vs_kstats *kstats)
{
	struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);

	if (!nl_stats)
		return -EMSGSIZE;

	if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
			      IPVS_STATS_ATTR_PAD) ||
	    nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
			      IPVS_STATS_ATTR_PAD))
		goto nla_put_failure;
	nla_nest_end(skb, nl_stats);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_stats);
	return -EMSGSIZE;
}
3878 
/* Emit one service as a nested IPVS_CMD_ATTR_SERVICE attribute.  The
 * service is identified either by fwmark or by <protocol, addr, port>.
 * Reads the scheduler and pe pointers via rcu_dereference(), so the
 * caller must be in an RCU read-side section (or otherwise keep the
 * service stable).  Returns 0 or -EMSGSIZE, cancelling the nest on
 * failure.
 */
static int ip_vs_genl_fill_service(struct sk_buff *skb,
				   struct ip_vs_service *svc)
{
	struct ip_vs_scheduler *sched;
	struct ip_vs_pe *pe;
	struct nlattr *nl_service;
	struct ip_vs_flags flags = { .flags = svc->flags,
				     .mask = ~0 };
	struct ip_vs_kstats kstats;
	char *sched_name;

	nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
	if (!nl_service)
		return -EMSGSIZE;

	if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
		goto nla_put_failure;
	if (svc->fwmark) {
		if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
			goto nla_put_failure;
	} else {
		if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
		    nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
		    nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
			goto nla_put_failure;
	}

	/* A service may have no scheduler attached; report "none" */
	sched = rcu_dereference(svc->scheduler);
	sched_name = sched ? sched->name : "none";
	pe = rcu_dereference(svc->pe);
	if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
	    (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
	    nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
	    nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
	    nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
		goto nla_put_failure;
	/* Both the legacy 32-bit and the 64-bit stats containers */
	ip_vs_copy_stats(&kstats, &svc->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_service);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_service);
	return -EMSGSIZE;
}
3929 
/* Wrap one service into an NLM_F_MULTI genetlink message for a dump.
 * Returns 0 on success or -EMSGSIZE when the skb is full, in which
 * case the dump resumes in a later invocation.
 */
static int ip_vs_genl_dump_service(struct sk_buff *skb,
				   struct ip_vs_service *svc,
				   struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_SERVICE);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_service(skb, svc) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}
3952 
/* .dumpit handler for IPVS_CMD_GET_SERVICE: walks the whole service
 * table under svc_resize_sem (read) + RCU.  cb->args[0] carries the
 * number of entries already dumped, so an interrupted dump resumes
 * where it left off.
 */
static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	int start = cb->args[0];
	int idx = 0;

	down_read(&ipvs->svc_resize_sem);
	rcu_read_lock();
	ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			/* Skip entries emitted by a previous round */
			if (++idx <= start)
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				/* Retry this entry on the next round */
				idx--;
				goto nla_put_failure;
			}
		}
	}

nla_put_failure:
	rcu_read_unlock();
	up_read(&ipvs->svc_resize_sem);
	cb->args[0] = idx;

	return skb->len;
}
3985 
/* Return true if @af is an address family this IPVS build can serve:
 * IPv4 always; IPv6 only when compiled in and the ipv6 module is up.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
3996 
/* Parse a nested IPVS_CMD_ATTR_SERVICE attribute into *usvc and look
 * up the matching service (*ret_svc is set, possibly to NULL).  With
 * full_entry, the scheduler/flags/timeout/netmask attributes are
 * required as well and the flags are merged with those of an existing
 * service.  Returns 0, -EINVAL on missing/invalid attributes or
 * -EAFNOSUPPORT for an unusable address family.
 *
 * NOTE(review): sched_name/pe_name are pointers into the netlink
 * message and the service lookup relies on the caller's locking
 * (service_mutex or RCU) - confirm callers honour both.
 */
static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
				    struct ip_vs_service_user_kern *usvc,
				    struct nlattr *nla, bool full_entry,
				    struct ip_vs_service **ret_svc)
{
	struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
	struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
	struct ip_vs_service *svc;

	/* Parse mandatory identifying service fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
		return -EINVAL;

	nla_af		= attrs[IPVS_SVC_ATTR_AF];
	nla_protocol	= attrs[IPVS_SVC_ATTR_PROTOCOL];
	nla_addr	= attrs[IPVS_SVC_ATTR_ADDR];
	nla_port	= attrs[IPVS_SVC_ATTR_PORT];
	nla_fwmark	= attrs[IPVS_SVC_ATTR_FWMARK];

	/* Identification is either fwmark or the protocol/addr/port triple */
	if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
		return -EINVAL;

	memset(usvc, 0, sizeof(*usvc));

	usvc->af = nla_get_u16(nla_af);
	if (!ip_vs_is_af_valid(usvc->af))
		return -EAFNOSUPPORT;

	if (nla_fwmark) {
		/* fwmark service: protocol is set to TCP as a placeholder */
		usvc->protocol = IPPROTO_TCP;
		usvc->fwmark = nla_get_u32(nla_fwmark);
	} else {
		usvc->protocol = nla_get_u16(nla_protocol);
		nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
		usvc->port = nla_get_be16(nla_port);
		usvc->fwmark = 0;
	}

	if (usvc->fwmark)
		svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
	else
		svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
					   &usvc->addr, usvc->port);
	*ret_svc = svc;

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
			      *nla_netmask;
		struct ip_vs_flags flags;

		nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
		nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
		nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
		nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
		nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];

		if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
			return -EINVAL;

		nla_memcpy(&flags, nla_flags, sizeof(flags));

		/* prefill flags from service if it already exists */
		if (svc)
			usvc->flags = svc->flags;

		/* set new flags from userland */
		usvc->flags = (usvc->flags & ~flags.mask) |
			      (flags.flags & flags.mask);
		usvc->sched_name = nla_data(nla_sched);
		usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
		usvc->timeout = nla_get_u32(nla_timeout);
		usvc->netmask = nla_get_be32(nla_netmask);
	}

	return 0;
}
4075 
4076 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
4077 						     struct nlattr *nla)
4078 {
4079 	struct ip_vs_service_user_kern usvc;
4080 	struct ip_vs_service *svc;
4081 	int ret;
4082 
4083 	ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
4084 	return ret ? ERR_PTR(ret) : svc;
4085 }
4086 
/* Emit one destination as a nested IPVS_CMD_ATTR_DEST attribute,
 * including both the legacy 32-bit and the 64-bit stats containers.
 * Returns 0 or -EMSGSIZE, cancelling the started nest on failure.
 */
static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
{
	struct nlattr *nl_dest;
	struct ip_vs_kstats kstats;

	nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
	if (!nl_dest)
		return -EMSGSIZE;

	if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
	    /* Only the forwarding-method bits of conn_flags are exposed */
	    nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
			(atomic_read(&dest->conn_flags) &
			 IP_VS_CONN_F_FWD_MASK)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
			atomic_read(&dest->weight)) ||
	    nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
		       dest->tun_type) ||
	    nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
			 dest->tun_port) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
			dest->tun_flags) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
			atomic_read(&dest->activeconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
			atomic_read(&dest->inactconns)) ||
	    nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
			atomic_read(&dest->persistconns)) ||
	    nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
		goto nla_put_failure;
	ip_vs_copy_stats(&kstats, &dest->stats);
	if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
		goto nla_put_failure;
	if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
		goto nla_put_failure;

	nla_nest_end(skb, nl_dest);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_dest);
	return -EMSGSIZE;
}
4133 
/* Wrap one destination into an NLM_F_MULTI genetlink message for a
 * dump.  Returns 0 on success or -EMSGSIZE when the skb is full, in
 * which case the dump resumes in a later invocation.
 */
static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
				struct netlink_callback *cb)
{
	void *hdr;

	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			  &ip_vs_genl_family, NLM_F_MULTI,
			  IPVS_CMD_NEW_DEST);
	if (!hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_dest(skb, dest) < 0)
		goto nla_put_failure;

	genlmsg_end(skb, hdr);
	return 0;

nla_put_failure:
	genlmsg_cancel(skb, hdr);
	return -EMSGSIZE;
}
4155 
/* .dumpit handler for IPVS_CMD_GET_DEST: parses the target service
 * from the dump request and walks its destination list under RCU.
 * cb->args[0] carries the number of entries already dumped so an
 * interrupted dump resumes where it left off.  A missing or unknown
 * service simply ends the dump.
 */
static int ip_vs_genl_dump_dests(struct sk_buff *skb,
				 struct netlink_callback *cb)
{
	int idx = 0;
	int start = cb->args[0];
	struct ip_vs_service *svc;
	struct ip_vs_dest *dest;
	struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();

	/* Try to find the service for which to dump destinations */
	if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
		goto out_err;


	svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
	if (IS_ERR_OR_NULL(svc))
		goto out_err;

	/* Dump the destinations */
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		/* Skip entries emitted by a previous round */
		if (++idx <= start)
			continue;
		if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
			/* Retry this entry on the next round */
			idx--;
			goto nla_put_failure;
		}
	}

nla_put_failure:
	cb->args[0] = idx;

out_err:
	rcu_read_unlock();

	return skb->len;
}
4196 
/* Parse a nested IPVS_CMD_ATTR_DEST attribute into *udest.  Address
 * and port are always required.  With full_entry, forwarding method,
 * weight and both thresholds are mandatory too, while the tunnel
 * attributes remain optional.  udest->af is left 0 when
 * IPVS_DEST_ATTR_ADDR_FAMILY is absent.  Returns 0 or -EINVAL.
 */
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
	struct nlattr *nla_addr, *nla_port;
	struct nlattr *nla_addr_family;

	/* Parse mandatory identifying destination fields first */
	if (nla == NULL ||
	    nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
		return -EINVAL;

	nla_addr	= attrs[IPVS_DEST_ATTR_ADDR];
	nla_port	= attrs[IPVS_DEST_ATTR_PORT];
	nla_addr_family	= attrs[IPVS_DEST_ATTR_ADDR_FAMILY];

	if (!(nla_addr && nla_port))
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
	udest->port = nla_get_be16(nla_port);

	udest->af = nla_get_u16_default(nla_addr_family, 0);

	/* If a full entry was requested, check for the additional fields */
	if (full_entry) {
		struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
			      *nla_l_thresh, *nla_tun_type, *nla_tun_port,
			      *nla_tun_flags;

		nla_fwd		= attrs[IPVS_DEST_ATTR_FWD_METHOD];
		nla_weight	= attrs[IPVS_DEST_ATTR_WEIGHT];
		nla_u_thresh	= attrs[IPVS_DEST_ATTR_U_THRESH];
		nla_l_thresh	= attrs[IPVS_DEST_ATTR_L_THRESH];
		nla_tun_type	= attrs[IPVS_DEST_ATTR_TUN_TYPE];
		nla_tun_port	= attrs[IPVS_DEST_ATTR_TUN_PORT];
		nla_tun_flags	= attrs[IPVS_DEST_ATTR_TUN_FLAGS];

		if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
			return -EINVAL;

		/* Only the forwarding-method bits are taken from userland */
		udest->conn_flags = nla_get_u32(nla_fwd)
				    & IP_VS_CONN_F_FWD_MASK;
		udest->weight = nla_get_u32(nla_weight);
		udest->u_threshold = nla_get_u32(nla_u_thresh);
		udest->l_threshold = nla_get_u32(nla_l_thresh);

		if (nla_tun_type)
			udest->tun_type = nla_get_u8(nla_tun_type);

		if (nla_tun_port)
			udest->tun_port = nla_get_be16(nla_tun_port);

		if (nla_tun_flags)
			udest->tun_flags = nla_get_u16(nla_tun_flags);
	}

	return 0;
}
4258 
/* Put one sync-daemon config @c with running @state into @skb as a
 * nested IPVS_CMD_ATTR_DAEMON attribute.  Returns 0 or -EMSGSIZE; on
 * failure the nest is cancelled so the skb stays consistent.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nl_daemon;

	nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nl_daemon)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
	    nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
	    nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
	    nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
	    nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto nla_put_failure;
	/* The multicast group is reported with a family-specific attribute */
#ifdef CONFIG_IP_VS_IPV6
	if (c->mcast_af == AF_INET6) {
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto nla_put_failure;
	} else
#endif
		if (c->mcast_af == AF_INET &&
		    nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto nla_put_failure;
	nla_nest_end(skb, nl_daemon);

	return 0;

nla_put_failure:
	nla_nest_cancel(skb, nl_daemon);
	return -EMSGSIZE;
}
4294 
4295 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
4296 				  struct ipvs_sync_daemon_cfg *c,
4297 				  struct netlink_callback *cb)
4298 {
4299 	void *hdr;
4300 	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4301 			  &ip_vs_genl_family, NLM_F_MULTI,
4302 			  IPVS_CMD_NEW_DAEMON);
4303 	if (!hdr)
4304 		return -EMSGSIZE;
4305 
4306 	if (ip_vs_genl_fill_daemon(skb, state, c))
4307 		goto nla_put_failure;
4308 
4309 	genlmsg_end(skb, hdr);
4310 	return 0;
4311 
4312 nla_put_failure:
4313 	genlmsg_cancel(skb, hdr);
4314 	return -EMSGSIZE;
4315 }
4316 
/* Dumpit callback for IPVS_CMD_GET_DAEMON: report the master and/or
 * backup sync-daemon configuration.  cb->args[0]/args[1] record that the
 * master/backup entry was already emitted across dump resumptions.
 */
static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
				   struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* sync_mutex guards sync_state and the mcfg/bcfg configurations */
	mutex_lock(&ipvs->sync_mutex);
	if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
					   &ipvs->mcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[0] = 1;
	}

	if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
		if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
					   &ipvs->bcfg, cb) < 0)
			goto nla_put_failure;

		cb->args[1] = 1;
	}

nla_put_failure:
	mutex_unlock(&ipvs->sync_mutex);

	return skb->len;
}
4345 
/* Handle IPVS_CMD_NEW_DAEMON: validate the nested daemon attributes and
 * start a sync thread (master or backup, per IPVS_DAEMON_ATTR_STATE).
 * STATE, MCAST_IFN and SYNC_ID are mandatory; the multicast group, port,
 * TTL and max packet length are optional overrides of the defaults.
 */
static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
{
	struct ipvs_sync_daemon_cfg c;
	struct nlattr *a;
	int ret;

	memset(&c, 0, sizeof(c));
	if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
	      attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
	      attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
		return -EINVAL;
	strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
		sizeof(c.mcast_ifn));
	c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);

	a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
	if (a)
		c.sync_maxlen = nla_get_u16(a);

	/* The mcast group must actually be a multicast address; an IPv4
	 * group takes precedence when both families are supplied
	 */
	a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
	if (a) {
		c.mcast_af = AF_INET;
		c.mcast_group.ip = nla_get_in_addr(a);
		if (!ipv4_is_multicast(c.mcast_group.ip))
			return -EINVAL;
	} else {
		a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
		if (a) {
#ifdef CONFIG_IP_VS_IPV6
			int addr_type;

			c.mcast_af = AF_INET6;
			c.mcast_group.in6 = nla_get_in6_addr(a);
			addr_type = ipv6_addr_type(&c.mcast_group.in6);
			if (!(addr_type & IPV6_ADDR_MULTICAST))
				return -EINVAL;
#else
			return -EAFNOSUPPORT;
#endif
		}
	}

	a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
	if (a)
		c.mcast_port = nla_get_u16(a);

	a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
	if (a)
		c.mcast_ttl = nla_get_u8(a);

	/* The synchronization protocol is incompatible with mixed family
	 * services
	 */
	if (ipvs->mixed_address_family_dests > 0)
		return -EINVAL;

	ret = start_sync_thread(ipvs, &c,
				nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
	return ret;
}
4406 
4407 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4408 {
4409 	int ret;
4410 
4411 	if (!attrs[IPVS_DAEMON_ATTR_STATE])
4412 		return -EINVAL;
4413 
4414 	ret = stop_sync_thread(ipvs,
4415 			       nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4416 	return ret;
4417 }
4418 
4419 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
4420 {
4421 	struct ip_vs_timeout_user t;
4422 
4423 	__ip_vs_get_timeouts(ipvs, &t);
4424 
4425 	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
4426 		t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
4427 
4428 	if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
4429 		t.tcp_fin_timeout =
4430 			nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
4431 
4432 	if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
4433 		t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
4434 
4435 	return ip_vs_set_timeout(ipvs, &t);
4436 }
4437 
/* Doit handler for IPVS_CMD_NEW_DAEMON / IPVS_CMD_DEL_DAEMON: parse the
 * nested IPVS_CMD_ATTR_DAEMON attribute and dispatch to the new/del
 * helpers.  Any other command (or a missing/bad nest) yields -EINVAL.
 */
static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
{
	int ret = -EINVAL, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
		struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];

		if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
		    nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
			goto out;

		if (cmd == IPVS_CMD_NEW_DAEMON)
			ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
		else
			ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
	}

out:
	return ret;
}
4462 
/* Doit handler shared by all state-changing IPVS commands: service and
 * destination add/edit/del, flush, zero and set-config.  All changes are
 * serialized under the per-netns service_mutex.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	bool need_full_svc = false, need_full_dest = false;
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	mutex_lock(&ipvs->service_mutex);

	/* Commands that take no service argument are handled first */
	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush(ipvs, false);
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		/* ZERO without a service attribute zeroes all services */
		ret = ip_vs_zero_all(ipvs);
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = true;

	/* We use function that requires RCU lock (hlist_bl) */
	rcu_read_lock();
	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc, &svc);
	rcu_read_unlock();
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = true;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto out;
		}

		/* Mixed-family dest: only allowed when not syncing and only
		 * for forwarding methods that can cross families
		 */
		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
	}

	/* Dispatch to the actual configuration operation */
	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}
4594 
/* Doit handler for the non-dump GET commands (GET_SERVICE, GET_CONFIG,
 * GET_INFO): build a single unicast reply.  Error paths: nla_put_failure
 * frees the message with -EMSGSIZE, out_err frees it with a specific
 * error, out only drops the RCU read lock after a successful reply.
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	/* The reply uses the corresponding NEW_/SET_ command number */
	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	/* RCU protects the service lookup below */
	rcu_read_lock();

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				get_conn_tab_size(ipvs)))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_reply(msg, info);
	goto out;

nla_put_failure:
	pr_err("not enough space in Netlink message\n");
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	rcu_read_unlock();

	return ret;
}
4692 
4693 
/* Command table for the IPVS genetlink family.  All commands require
 * CAP_NET_ADMIN (GENL_ADMIN_PERM); the state-changing commands share
 * ip_vs_genl_set_cmd / ip_vs_genl_set_daemon as doit, and the GET_*
 * list commands provide dumpit callbacks.
 */
static const struct genl_small_ops ip_vs_genl_ops[] = {
	{
		.cmd	= IPVS_CMD_NEW_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_SERVICE,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
		.dumpit	= ip_vs_genl_dump_services,
	},
	{
		.cmd	= IPVS_CMD_NEW_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_SET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_DEL_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_DEST,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_dests,
	},
	{
		.cmd	= IPVS_CMD_NEW_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_DEL_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_daemon,
	},
	{
		.cmd	= IPVS_CMD_GET_DAEMON,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.dumpit	= ip_vs_genl_dump_daemons,
	},
	{
		.cmd	= IPVS_CMD_SET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_CONFIG,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_GET_INFO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_get_cmd,
	},
	{
		.cmd	= IPVS_CMD_ZERO,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
	{
		.cmd	= IPVS_CMD_FLUSH,
		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
		.flags	= GENL_ADMIN_PERM,
		.doit	= ip_vs_genl_set_cmd,
	},
};
4793 
/* The IPVS Generic Netlink family.  parallel_ops skips the global genl
 * serialization; handlers rely on the per-netns service_mutex and
 * sync_mutex (and RCU) for their own locking.
 */
static struct genl_family ip_vs_genl_family __ro_after_init = {
	.hdrsize	= 0,
	.name		= IPVS_GENL_NAME,
	.version	= IPVS_GENL_VERSION,
	.maxattr	= IPVS_CMD_ATTR_MAX,
	.policy = ip_vs_cmd_policy,
	.netnsok        = true,         /* Make ipvsadm to work on netns */
	.module		= THIS_MODULE,
	.small_ops	= ip_vs_genl_ops,
	.n_small_ops	= ARRAY_SIZE(ip_vs_genl_ops),
	.resv_start_op	= IPVS_CMD_FLUSH + 1,
	.parallel_ops	= 1,
};
4807 
/* Register the IPVS genetlink family; returns 0 or a negative errno */
static int __init ip_vs_genl_register(void)
{
	return genl_register_family(&ip_vs_genl_family);
}
4812 
/* Unregister the IPVS genetlink family (module unload path) */
static void ip_vs_genl_unregister(void)
{
	genl_unregister_family(&ip_vs_genl_family);
}
4817 
4818 /* End of Generic Netlink interface definitions */
4819 
4820 /*
4821  * per netns intit/exit func.
4822  */
4823 #ifdef CONFIG_SYSCTL
/* Per-netns sysctl setup: duplicate the vs_vars table for non-init
 * namespaces, point each entry's ->data at the netns-private field, set
 * defaults, register the table, start the stats estimator and arm the
 * defense timer.  NOTE: the idx-ordered assignments below must match the
 * order of entries in vs_vars exactly.
 */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	struct ctl_table *tbl;
	int idx, ret;
	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
	/* Namespaces not owned by init_user_ns get read-only copies of
	 * some knobs (marked with "unpriv" below)
	 */
	bool unpriv = net->user_ns != &init_user_ns;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);
	ipvs->est_stopped = 0;

	/* init_net uses the static table directly; others need a copy so
	 * per-netns ->data pointers don't clash
	 */
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (tbl == NULL)
			return -ENOMEM;
	} else
		tbl = vs_vars;
	/* Initialize sysctl defaults */
	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
		if (tbl[idx].proc_handler == proc_do_defense_mode)
			tbl[idx].extra2 = ipvs;
	}
	idx = 0;
	ipvs->sysctl_amemthresh = 1024;
	tbl[idx++].data = &ipvs->sysctl_amemthresh;
	ipvs->sysctl_am_droprate = 10;
	tbl[idx++].data = &ipvs->sysctl_am_droprate;
	tbl[idx++].data = &ipvs->sysctl_drop_entry;
	tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
	ipvs->sysctl_snat_reroute = 1;
	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
	ipvs->sysctl_sync_ver = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ver;
	ipvs->sysctl_sync_ports = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ports;
	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;

	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;

	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	tbl[idx].data = &ipvs->sysctl_sync_threshold;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[idx++].data = &ipvs->sysctl_sync_retries;
	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
	ipvs->sysctl_pmtu_disc = 1;
	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
	tbl[idx++].data = &ipvs->sysctl_backup_only;
	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;

	ipvs->sysctl_run_estimation = 1;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;

	ipvs->est_cpulist_valid = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_cpulist;

	ipvs->sysctl_est_nice = IPVS_EST_NICE;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_nice;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_lfactor;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_svc_lfactor;

#ifdef CONFIG_IP_VS_DEBUG
	/* Global sysctls must be ro in non-init netns */
	if (!net_eq(net, &init_net))
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
						  ctl_table_size);
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	return 0;

err:
	/* sysctl_hdr may be NULL here; unregister_net_sysctl_table()
	 * tolerates that
	 */
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}
4961 
/* Per-netns sysctl teardown: stop deferred work, drop the sysctl table,
 * stop the stats estimator and free per-netns resources.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	/* NOTE(review): defense_work appears to re-queue itself, hence the
	 * extra cancel of the embedded plain work — confirm against
	 * defense_work_handler
	 */
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);

	if (ipvs->est_cpulist_valid)
		free_cpumask_var(ipvs->sysctl_est_cpulist);

	/* init_net uses the static vs_vars table; other netns own a copy */
	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}
4978 
4979 #else
4980 
/* CONFIG_SYSCTL disabled: provide no-op init/cleanup stubs */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
4983 
4984 #endif
4985 
/* Netdevice event notifier, handled by ip_vs_dst_event().  With IPv6 the
 * priority orders it relative to addrconf's notifier (see
 * ADDRCONF_NOTIFY_PRIORITY).
 */
static struct notifier_block ip_vs_dst_notifier = {
	.notifier_call = ip_vs_dst_event,
#ifdef CONFIG_IP_VS_IPV6
	.priority = ADDRCONF_NOTIFY_PRIORITY + 5,
#endif
};
4992 
/* Per-netns control-plane init: locks, tables, stats, procfs entries and
 * sysctl.  On failure every already-acquired resource is released via
 * the reverse-order error ladder at the bottom.
 */
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	int ret = -ENOMEM;
	int idx;

	/* Initialize service_mutex, svc_table per netns */
	__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
	init_rwsem(&ipvs->svc_resize_sem);
	INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
	atomic_set(&ipvs->svc_table_changes, 0);
	RCU_INIT_POINTER(ipvs->svc_table, NULL);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	/* Per-address-family service/dest counters start at zero */
	for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
		atomic_set(&ipvs->num_services[idx], 0);
		atomic_set(&ipvs->fwm_services[idx], 0);
		atomic_set(&ipvs->nonfwm_services[idx], 0);
		atomic_set(&ipvs->ftpsvc_counter[idx], 0);
		atomic_set(&ipvs->nullsvc_counter[idx], 0);
		atomic_set(&ipvs->conn_out_counter[idx], 0);
	}

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
	ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);

	/* procfs stats */
	ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
	if (!ipvs->tot_stats)
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
	if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
				    ip_vs_status_show, NULL))
		goto err_status;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;

	/* Unwind in reverse order of acquisition */
err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);

err_status:
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	ip_vs_stats_release(&ipvs->tot_stats->s);

err_tot_stats:
	kfree(ipvs->tot_stats);

out:
	return ret;
}
5076 
/* Per-netns control-plane teardown, mirroring ip_vs_control_net_init() */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	/* tot_stats may still be referenced by RCU readers: free it only
	 * after a grace period
	 */
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
5090 
5091 int __init ip_vs_register_nl_ioctl(void)
5092 {
5093 	int ret;
5094 
5095 	ret = nf_register_sockopt(&ip_vs_sockopts);
5096 	if (ret) {
5097 		pr_err("cannot register sockopt.\n");
5098 		goto err_sock;
5099 	}
5100 
5101 	ret = ip_vs_genl_register();
5102 	if (ret) {
5103 		pr_err("cannot register Generic Netlink interface.\n");
5104 		goto err_genl;
5105 	}
5106 	return 0;
5107 
5108 err_genl:
5109 	nf_unregister_sockopt(&ip_vs_sockopts);
5110 err_sock:
5111 	return ret;
5112 }
5113 
/* Undo ip_vs_register_nl_ioctl() in reverse registration order */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}
5119 
5120 int __init ip_vs_control_init(void)
5121 {
5122 	int ret;
5123 
5124 	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
5125 	if (ret < 0)
5126 		return ret;
5127 
5128 	return 0;
5129 }
5130 
5131 
/* Module-wide control cleanup, mirroring ip_vs_control_init() */
void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}
5137