1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPVS An implementation of the IP virtual server support for the 4 * LINUX operating system. IPVS is now implemented as a module 5 * over the Netfilter framework. IPVS can be used to build a 6 * high-performance and highly available server based on a 7 * cluster of servers. 8 * 9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 10 * Peter Kese <peter.kese@ijs.si> 11 * Julian Anastasov <ja@ssi.bg> 12 * 13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, 14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms 15 * and others. 16 * 17 * Changes: 18 * Paul `Rusty' Russell properly handle non-linear skbs 19 * Harald Welte don't use nfcache 20 */ 21 22 #define pr_fmt(fmt) "IPVS: " fmt 23 24 #include <linux/module.h> 25 #include <linux/kernel.h> 26 #include <linux/ip.h> 27 #include <linux/tcp.h> 28 #include <linux/sctp.h> 29 #include <linux/icmp.h> 30 #include <linux/slab.h> 31 32 #include <net/ip.h> 33 #include <net/tcp.h> 34 #include <net/udp.h> 35 #include <net/icmp.h> /* for icmp_send */ 36 #include <net/gue.h> 37 #include <net/gre.h> 38 #include <net/route.h> 39 #include <net/ip6_checksum.h> 40 #include <net/netns/generic.h> /* net_generic() */ 41 42 #include <linux/netfilter.h> 43 #include <linux/netfilter_ipv4.h> 44 45 #ifdef CONFIG_IP_VS_IPV6 46 #include <net/ipv6.h> 47 #include <linux/netfilter_ipv6.h> 48 #include <net/ip6_route.h> 49 #endif 50 51 #include <net/ip_vs.h> 52 #include <linux/indirect_call_wrapper.h> 53 54 55 EXPORT_SYMBOL(register_ip_vs_scheduler); 56 EXPORT_SYMBOL(unregister_ip_vs_scheduler); 57 EXPORT_SYMBOL(ip_vs_proto_name); 58 EXPORT_SYMBOL(ip_vs_conn_new); 59 EXPORT_SYMBOL(ip_vs_conn_in_get); 60 EXPORT_SYMBOL(ip_vs_conn_out_get); 61 #ifdef CONFIG_IP_VS_PROTO_TCP 62 EXPORT_SYMBOL(ip_vs_tcp_conn_listen); 63 #endif 64 EXPORT_SYMBOL(ip_vs_conn_put); 65 #ifdef CONFIG_IP_VS_DEBUG 66 EXPORT_SYMBOL(ip_vs_get_debug_level); 67 #endif 68 EXPORT_SYMBOL(ip_vs_new_conn_out); 69 70 #if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP) 71 #define SNAT_CALL(f, ...) \ 72 INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__) 73 #elif defined(CONFIG_IP_VS_PROTO_TCP) 74 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__) 75 #elif defined(CONFIG_IP_VS_PROTO_UDP) 76 #define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__) 77 #else 78 #define SNAT_CALL(f, ...) f(__VA_ARGS__) 79 #endif 80 81 static unsigned int ip_vs_net_id __read_mostly; 82 /* netns cnt used for uniqueness */ 83 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); 84 85 /* ID used in ICMP lookups */ 86 #define icmp_id(icmph) (((icmph)->un).echo.id) 87 #define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) 88 89 const char *ip_vs_proto_name(unsigned int proto) 90 { 91 static char buf[20]; 92 93 switch (proto) { 94 case IPPROTO_IP: 95 return "IP"; 96 case IPPROTO_UDP: 97 return "UDP"; 98 case IPPROTO_TCP: 99 return "TCP"; 100 case IPPROTO_SCTP: 101 return "SCTP"; 102 case IPPROTO_ICMP: 103 return "ICMP"; 104 #ifdef CONFIG_IP_VS_IPV6 105 case IPPROTO_ICMPV6: 106 return "ICMPv6"; 107 #endif 108 default: 109 sprintf(buf, "IP_%u", proto); 110 return buf; 111 } 112 } 113 114 void ip_vs_init_hash_table(struct list_head *table, int rows) 115 { 116 while (--rows >= 0) 117 INIT_LIST_HEAD(&table[rows]); 118 } 119 120 /* IPVS Resizable Hash Tables: 121 * - list_bl buckets with bit lock 122 * 123 * Goals: 124 * - RCU lookup for entry can run in parallel with add/del/move operations 125 * - hash keys can be on non-contiguous memory 126 * - support entries with duplicate keys 127 * - unlink entries without lookup, use the saved table and bucket id 128 * - resizing can trigger on load change or depending on key refresh period 129 * - customizable load factor to balance between speed and memory usage 130 * - add/del/move operations should be allowed for any context 131 * 132 * Resizing: 133 * - new table is attached to the current table and all entries are moved 134 * with new hash key. Finally, the new table is installed as current one and 135 * the old table is released after RCU grace period. 136 * - RCU read-side critical sections will walk two tables while resizing is 137 * in progress 138 * - new entries are added to the new table 139 * - entries will be deleted from the old or from the new table, the table_id 140 * can be saved into entry as part of the hash key to know where the entry is 141 * hashed 142 * - move operations may delay readers or to cause retry for the modified 143 * bucket. As result, searched entry will be found but walkers that operate 144 * on multiple entries may see same entry twice if bucket walking is retried. 145 * - for fast path the number of entries (load) can be compared to u_thresh 146 * and l_thresh to decide when to trigger table growing/shrinking. They 147 * are calculated based on load factor (shift count), negative value allows 148 * load to be below 100% to reduce collisions by maintaining larger table 149 * while positive value tolerates collisions by using smaller table and load 150 * above 100%: u_thresh(load) = size * (2 ^ lfactor) 151 * 152 * Locking: 153 * - lock: protect seqc if other context except resizer can move entries 154 * - seqc: seqcount_t, delay/retry readers while entries are moved to 155 * new table on resizing 156 * - bit lock: serialize bucket modifications 157 * - writers may use other locking mechanisms to serialize operations for 158 * resizing, moving and installing new tables 159 */ 160 161 void ip_vs_rht_free(struct ip_vs_rht *t) 162 { 163 kvfree(t->buckets); 164 kvfree(t->seqc); 165 kvfree(t->lock); 166 kfree(t); 167 } 168 169 void ip_vs_rht_rcu_free(struct rcu_head *head) 170 { 171 struct ip_vs_rht *t; 172 173 t = container_of(head, struct ip_vs_rht, rcu_head); 174 ip_vs_rht_free(t); 175 } 176 177 struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks) 178 { 179 struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL); 180 int i; 181 182 if (!t) 183 return NULL; 184 if (scounts) { 185 int ml = roundup_pow_of_two(nr_cpu_ids); 186 187 scounts = min(scounts, buckets); 188 scounts = min(scounts, ml); 189 t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL); 190 if (!t->seqc) 191 goto err; 192 for (i = 0; i < scounts; i++) 193 seqcount_init(&t->seqc[i]); 194 195 if (locks) { 196 locks = min(locks, scounts); 197 t->lock = kvmalloc_array(locks, sizeof(*t->lock), 198 GFP_KERNEL); 199 if (!t->lock) 200 goto err; 201 for (i = 0; i < locks; i++) 202 spin_lock_init(&t->lock[i].l); 203 } 204 } 205 206 t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL); 207 if (!t->buckets) 208 goto err; 209 for (i = 0; i < buckets; i++) 210 INIT_HLIST_BL_HEAD(&t->buckets[i]); 211 t->mask = buckets - 1; 212 t->size = buckets; 213 t->seqc_mask = scounts - 1; 214 t->lock_mask = locks - 1; 215 t->u_thresh = buckets; 216 t->l_thresh = buckets >> 4; 217 t->bits = order_base_2(buckets); 218 /* new_tbl points to self if no new table is filled */ 219 RCU_INIT_POINTER(t->new_tbl, t); 220 get_random_bytes(&t->hash_key, sizeof(t->hash_key)); 221 return t; 222 223 err: 224 ip_vs_rht_free(t); 225 return NULL; 226 } 227 228 /* Get the desired table size for n entries based on current table size and 229 * by using the formula size = n / (2^lfactor) 230 * lfactor: shift value for the load factor: 231 * - >0: u_thresh=size << lfactor, for load factor above 100% 232 * - <0: u_thresh=size >> -lfactor, for load factor below 100% 233 * - 0: for load factor of 100% 234 */ 235 int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, 236 int lfactor, int min_bits, int max_bits) 237 { 238 if (!t) 239 return 1 << min_bits; 240 n = roundup_pow_of_two(n); 241 if (lfactor < 0) { 242 int factor = min(-lfactor, max_bits); 243 244 n = min(n, 1 << (max_bits - factor)); 245 n <<= factor; 246 } else { 247 n = min(n >> lfactor, 1 << max_bits); 248 } 249 if (lfactor != t->lfactor) 250 return clamp(n, 1 << min_bits, 1 << max_bits); 251 if (n > t->size) 252 return n; 253 if (n > t->size >> 4) 254 return t->size; 255 /* Shrink but keep it n * 2 to prevent frequent resizing */ 256 return clamp(n << 1, 1 << min_bits, 1 << max_bits); 257 } 258 259 /* Set thresholds based on table size and load factor: 260 * u_thresh = size * (2^lfactor) 261 * l_thresh = u_thresh / 16 262 * u_thresh/l_thresh can be used to check if load triggers a table grow/shrink 263 */ 264 void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, 265 int min_bits, int max_bits) 266 { 267 if (size >= 1 << max_bits) 268 t->u_thresh = INT_MAX; /* stop growing */ 269 else if (lfactor <= 0) 270 t->u_thresh = size >> min(-lfactor, max_bits); 271 else 272 t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor; 273 274 /* l_thresh: shrink when load is 16 times lower, can be 0 */ 275 if (size >= 1 << max_bits) 276 t->l_thresh = (1 << max_bits) >> 4; 277 else if (size > 1 << min_bits) 278 t->l_thresh = t->u_thresh >> 4; 279 else 280 t->l_thresh = 0; /* stop shrinking */ 281 } 282 283 /* Return hash value for local info (fast, insecure) */ 284 u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, 285 const union nf_inet_addr *addr, u32 v1, u32 v2) 286 { 287 u32 v3; 288 289 #ifdef CONFIG_IP_VS_IPV6 290 if (af == AF_INET6) 291 v3 = ipv6_addr_hash(&addr->in6); 292 else 293 #endif 294 v3 = addr->all[0]; 295 296 return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]); 297 } 298 299 static inline void 300 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 301 { 302 struct ip_vs_dest *dest = cp->dest; 303 struct netns_ipvs *ipvs = cp->ipvs; 304 305 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 306 struct ip_vs_cpu_stats *s; 307 struct ip_vs_service *svc; 308 309 local_bh_disable(); 310 311 s = this_cpu_ptr(dest->stats.cpustats); 312 u64_stats_update_begin(&s->syncp); 313 u64_stats_inc(&s->cnt.inpkts); 314 u64_stats_add(&s->cnt.inbytes, skb->len); 315 u64_stats_update_end(&s->syncp); 316 317 svc = rcu_dereference(dest->svc); 318 s = this_cpu_ptr(svc->stats.cpustats); 319 u64_stats_update_begin(&s->syncp); 320 u64_stats_inc(&s->cnt.inpkts); 321 u64_stats_add(&s->cnt.inbytes, skb->len); 322 u64_stats_update_end(&s->syncp); 323 324 s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); 325 u64_stats_update_begin(&s->syncp); 326 u64_stats_inc(&s->cnt.inpkts); 327 u64_stats_add(&s->cnt.inbytes, skb->len); 328 u64_stats_update_end(&s->syncp); 329 330 local_bh_enable(); 331 } 332 } 333 334 335 static inline void 336 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 337 { 338 struct ip_vs_dest *dest = cp->dest; 339 struct netns_ipvs *ipvs = cp->ipvs; 340 341 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 342 struct ip_vs_cpu_stats *s; 343 struct ip_vs_service *svc; 344 345 local_bh_disable(); 346 347 s = this_cpu_ptr(dest->stats.cpustats); 348 u64_stats_update_begin(&s->syncp); 349 u64_stats_inc(&s->cnt.outpkts); 350 u64_stats_add(&s->cnt.outbytes, skb->len); 351 u64_stats_update_end(&s->syncp); 352 353 svc = rcu_dereference(dest->svc); 354 s = this_cpu_ptr(svc->stats.cpustats); 355 u64_stats_update_begin(&s->syncp); 356 u64_stats_inc(&s->cnt.outpkts); 357 u64_stats_add(&s->cnt.outbytes, skb->len); 358 u64_stats_update_end(&s->syncp); 359 360 s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); 361 u64_stats_update_begin(&s->syncp); 362 u64_stats_inc(&s->cnt.outpkts); 363 u64_stats_add(&s->cnt.outbytes, skb->len); 364 u64_stats_update_end(&s->syncp); 365 366 local_bh_enable(); 367 } 368 } 369 370 371 static inline void 372 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 373 { 374 struct netns_ipvs *ipvs = svc->ipvs; 375 struct ip_vs_cpu_stats *s; 376 377 local_bh_disable(); 378 379 s = this_cpu_ptr(cp->dest->stats.cpustats); 380 u64_stats_update_begin(&s->syncp); 381 u64_stats_inc(&s->cnt.conns); 382 u64_stats_update_end(&s->syncp); 383 384 s = this_cpu_ptr(svc->stats.cpustats); 385 u64_stats_update_begin(&s->syncp); 386 u64_stats_inc(&s->cnt.conns); 387 u64_stats_update_end(&s->syncp); 388 389 s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); 390 u64_stats_update_begin(&s->syncp); 391 u64_stats_inc(&s->cnt.conns); 392 u64_stats_update_end(&s->syncp); 393 394 local_bh_enable(); 395 } 396 397 398 static inline void 399 ip_vs_set_state(struct ip_vs_conn *cp, int direction, 400 const struct sk_buff *skb, 401 struct ip_vs_proto_data *pd) 402 { 403 if (likely(pd->pp->state_transition)) 404 pd->pp->state_transition(cp, direction, skb, pd); 405 } 406 407 static inline int 408 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 409 struct sk_buff *skb, int protocol, 410 const union nf_inet_addr *caddr, __be16 cport, 411 const union nf_inet_addr *vaddr, __be16 vport, 412 struct ip_vs_conn_param *p) 413 { 414 ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr, 415 vport, p); 416 p->pe = rcu_dereference(svc->pe); 417 if (p->pe && p->pe->fill_param) 418 return p->pe->fill_param(p, skb); 419 420 return 0; 421 } 422 423 /* 424 * IPVS persistent scheduling function 425 * It creates a connection entry according to its template if exists, 426 * or selects a server and creates a connection entry plus a template. 427 * Locking: we are svc user (svc->refcnt), so we hold all dests too 428 * Protocols supported: TCP, UDP 429 */ 430 static struct ip_vs_conn * 431 ip_vs_sched_persist(struct ip_vs_service *svc, 432 struct sk_buff *skb, __be16 src_port, __be16 dst_port, 433 int *ignored, struct ip_vs_iphdr *iph) 434 { 435 struct ip_vs_conn *cp = NULL; 436 struct ip_vs_dest *dest; 437 struct ip_vs_conn *ct; 438 __be16 dport = 0; /* destination port to forward */ 439 unsigned int flags; 440 struct ip_vs_conn_param param; 441 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 442 union nf_inet_addr snet; /* source network of the client, 443 after masking */ 444 const union nf_inet_addr *src_addr, *dst_addr; 445 446 if (likely(!ip_vs_iph_inverse(iph))) { 447 src_addr = &iph->saddr; 448 dst_addr = &iph->daddr; 449 } else { 450 src_addr = &iph->daddr; 451 dst_addr = &iph->saddr; 452 } 453 454 455 /* Mask saddr with the netmask to adjust template granularity */ 456 #ifdef CONFIG_IP_VS_IPV6 457 if (svc->af == AF_INET6) 458 ipv6_addr_prefix(&snet.in6, &src_addr->in6, 459 (__force __u32) svc->netmask); 460 else 461 #endif 462 snet.ip = src_addr->ip & svc->netmask; 463 464 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 465 "mnet %s\n", 466 IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port), 467 IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port), 468 IP_VS_DBG_ADDR(svc->af, &snet)); 469 470 /* 471 * As far as we know, FTP is a very complicated network protocol, and 472 * it uses control connection and data connections. For active FTP, 473 * FTP server initialize data connection to the client, its source port 474 * is often 20. For passive FTP, FTP server tells the clients the port 475 * that it passively listens to, and the client issues the data 476 * connection. In the tunneling or direct routing mode, the load 477 * balancer is on the client-to-server half of connection, the port 478 * number is unknown to the load balancer. So, a conn template like 479 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP 480 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> 481 * is created for other persistent services. 482 */ 483 { 484 int protocol = iph->protocol; 485 const union nf_inet_addr *vaddr = dst_addr; 486 __be16 vport = 0; 487 488 if (dst_port == svc->port) { 489 /* non-FTP template: 490 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 491 * FTP template: 492 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 493 */ 494 if (svc->port != FTPPORT) 495 vport = dst_port; 496 } else { 497 /* Note: persistent fwmark-based services and 498 * persistent port zero service are handled here. 499 * fwmark template: 500 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> 501 * port zero template: 502 * <protocol,caddr,0,vaddr,0,daddr,0> 503 */ 504 if (svc->fwmark) { 505 protocol = IPPROTO_IP; 506 vaddr = &fwmark; 507 } 508 } 509 /* return *ignored = -1 so NF_DROP can be used */ 510 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 511 vaddr, vport, ¶m) < 0) { 512 *ignored = -1; 513 return NULL; 514 } 515 } 516 517 /* Check if a template already exists */ 518 ct = ip_vs_ct_in_get(¶m); 519 if (!ct || !ip_vs_check_template(ct, NULL)) { 520 struct ip_vs_scheduler *sched; 521 522 /* 523 * No template found or the dest of the connection 524 * template is not available. 525 * return *ignored=0 i.e. ICMP and NF_DROP 526 */ 527 sched = rcu_dereference(svc->scheduler); 528 if (sched) { 529 /* read svc->sched_data after svc->scheduler */ 530 smp_rmb(); 531 dest = sched->schedule(svc, skb, iph); 532 } else { 533 dest = NULL; 534 } 535 if (!dest) { 536 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 537 kfree(param.pe_data); 538 *ignored = 0; 539 return NULL; 540 } 541 542 if (dst_port == svc->port && svc->port != FTPPORT) 543 dport = dest->port; 544 545 /* Create a template 546 * This adds param.pe_data to the template, 547 * and thus param.pe_data will be destroyed 548 * when the template expires */ 549 ct = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, 550 IP_VS_CONN_F_TEMPLATE, dest, skb->mark); 551 if (ct == NULL) { 552 kfree(param.pe_data); 553 *ignored = -1; 554 return NULL; 555 } 556 557 ct->timeout = svc->timeout; 558 } else { 559 /* set destination with the found template */ 560 dest = ct->dest; 561 kfree(param.pe_data); 562 } 563 564 dport = dst_port; 565 if (dport == svc->port && dest->port) 566 dport = dest->port; 567 568 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 569 && iph->protocol == IPPROTO_UDP) ? 570 IP_VS_CONN_F_ONE_PACKET : 0; 571 572 /* 573 * Create a new connection according to the template 574 */ 575 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr, 576 src_port, dst_addr, dst_port, ¶m); 577 578 cp = ip_vs_conn_new(¶m, dest->af, &dest->addr, dport, flags, dest, 579 skb->mark); 580 if (cp == NULL) { 581 ip_vs_conn_put(ct); 582 *ignored = -1; 583 return NULL; 584 } 585 586 /* 587 * Add its control 588 */ 589 ip_vs_control_add(cp, ct); 590 ip_vs_conn_put(ct); 591 592 ip_vs_conn_stats(cp, svc); 593 return cp; 594 } 595 596 597 /* 598 * IPVS main scheduling function 599 * It selects a server according to the virtual service, and 600 * creates a connection entry. 601 * Protocols supported: TCP, UDP 602 * 603 * Usage of *ignored 604 * 605 * 1 : protocol tried to schedule (eg. on SYN), found svc but the 606 * svc/scheduler decides that this packet should be accepted with 607 * NF_ACCEPT because it must not be scheduled. 608 * 609 * 0 : scheduler can not find destination, so try bypass or 610 * return ICMP and then NF_DROP (ip_vs_leave). 611 * 612 * -1 : scheduler tried to schedule but fatal error occurred, eg. 613 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param 614 * failure such as missing Call-ID, ENOMEM on skb_linearize 615 * or pe_data. In this case we should return NF_DROP without 616 * any attempts to send ICMP with ip_vs_leave. 617 */ 618 struct ip_vs_conn * 619 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, 620 struct ip_vs_proto_data *pd, int *ignored, 621 struct ip_vs_iphdr *iph) 622 { 623 struct ip_vs_protocol *pp = pd->pp; 624 struct ip_vs_conn *cp = NULL; 625 struct ip_vs_scheduler *sched; 626 struct ip_vs_dest *dest; 627 __be16 _ports[2], *pptr, cport, vport; 628 const void *caddr, *vaddr; 629 unsigned int flags; 630 631 *ignored = 1; 632 /* 633 * IPv6 frags, only the first hit here. 634 */ 635 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 636 if (pptr == NULL) 637 return NULL; 638 639 if (likely(!ip_vs_iph_inverse(iph))) { 640 cport = pptr[0]; 641 caddr = &iph->saddr; 642 vport = pptr[1]; 643 vaddr = &iph->daddr; 644 } else { 645 cport = pptr[1]; 646 caddr = &iph->daddr; 647 vport = pptr[0]; 648 vaddr = &iph->saddr; 649 } 650 651 /* 652 * FTPDATA needs this check when using local real server. 653 * Never schedule Active FTPDATA connections from real server. 654 * For LVS-NAT they must be already created. For other methods 655 * with persistence the connection is created on SYN+ACK. 656 */ 657 if (cport == FTPDATA) { 658 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 659 "Not scheduling FTPDATA"); 660 return NULL; 661 } 662 663 /* 664 * Do not schedule replies from local real server. 665 */ 666 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) { 667 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 668 cp = INDIRECT_CALL_1(pp->conn_in_get, 669 ip_vs_conn_in_get_proto, svc->ipvs, 670 svc->af, skb, iph); 671 iph->hdr_flags ^= IP_VS_HDR_INVERSE; 672 673 if (cp) { 674 IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off, 675 "Not scheduling reply for existing" 676 " connection"); 677 __ip_vs_conn_put(cp); 678 return NULL; 679 } 680 } 681 682 /* 683 * Persistent service 684 */ 685 if (svc->flags & IP_VS_SVC_F_PERSISTENT) 686 return ip_vs_sched_persist(svc, skb, cport, vport, ignored, 687 iph); 688 689 *ignored = 0; 690 691 /* 692 * Non-persistent service 693 */ 694 if (!svc->fwmark && vport != svc->port) { 695 if (!svc->port) 696 pr_err("Schedule: port zero only supported " 697 "in persistent services, " 698 "check your ipvs configuration\n"); 699 return NULL; 700 } 701 702 sched = rcu_dereference(svc->scheduler); 703 if (sched) { 704 /* read svc->sched_data after svc->scheduler */ 705 smp_rmb(); 706 dest = sched->schedule(svc, skb, iph); 707 } else { 708 dest = NULL; 709 } 710 if (dest == NULL) { 711 IP_VS_DBG(1, "Schedule: no dest found.\n"); 712 return NULL; 713 } 714 715 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET 716 && iph->protocol == IPPROTO_UDP) ? 717 IP_VS_CONN_F_ONE_PACKET : 0; 718 719 /* 720 * Create a connection entry. 721 */ 722 { 723 struct ip_vs_conn_param p; 724 725 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 726 caddr, cport, vaddr, vport, &p); 727 cp = ip_vs_conn_new(&p, dest->af, &dest->addr, 728 dest->port ? dest->port : vport, 729 flags, dest, skb->mark); 730 if (!cp) { 731 *ignored = -1; 732 return NULL; 733 } 734 } 735 736 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 737 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 738 ip_vs_fwd_tag(cp), 739 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 740 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 741 IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), 742 cp->flags, refcount_read(&cp->refcnt)); 743 744 ip_vs_conn_stats(cp, svc); 745 return cp; 746 } 747 748 static inline int ip_vs_addr_is_unicast(struct net *net, int af, 749 union nf_inet_addr *addr) 750 { 751 #ifdef CONFIG_IP_VS_IPV6 752 if (af == AF_INET6) 753 return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST; 754 #endif 755 return (inet_addr_type(net, addr->ip) == RTN_UNICAST); 756 } 757 758 /* 759 * Pass or drop the packet. 760 * Called by ip_vs_in, when the virtual service is available but 761 * no destination is available for a new connection. 762 */ 763 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 764 struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph) 765 { 766 __be16 _ports[2], *pptr, dport; 767 struct netns_ipvs *ipvs = svc->ipvs; 768 struct net *net = ipvs->net; 769 770 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports); 771 if (!pptr) 772 return NF_DROP; 773 dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0]; 774 775 /* if it is fwmark-based service, the cache_bypass sysctl is up 776 and the destination is a non-local unicast, then create 777 a cache_bypass connection entry */ 778 if (sysctl_cache_bypass(ipvs) && svc->fwmark && 779 !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) && 780 ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) { 781 int ret; 782 struct ip_vs_conn *cp; 783 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 784 iph->protocol == IPPROTO_UDP) ? 785 IP_VS_CONN_F_ONE_PACKET : 0; 786 union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; 787 788 /* create a new connection entry */ 789 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 790 { 791 struct ip_vs_conn_param p; 792 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 793 &iph->saddr, pptr[0], 794 &iph->daddr, pptr[1], &p); 795 cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, 796 IP_VS_CONN_F_BYPASS | flags, 797 NULL, skb->mark); 798 if (!cp) 799 return NF_DROP; 800 } 801 802 /* statistics */ 803 ip_vs_in_stats(cp, skb); 804 805 /* set state */ 806 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 807 808 /* transmit the first SYN packet */ 809 ret = cp->packet_xmit(skb, cp, pd->pp, iph); 810 /* do not touch skb anymore */ 811 812 if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 813 atomic_inc(&cp->control->in_pkts); 814 else 815 atomic_inc(&cp->in_pkts); 816 ip_vs_conn_put(cp); 817 return ret; 818 } 819 820 /* 821 * When the virtual ftp service is presented, packets destined 822 * for other services on the VIP may get here (except services 823 * listed in the ipvs table), pass the packets, because it is 824 * not ipvs job to decide to drop the packets. 825 */ 826 if (svc->port == FTPPORT && dport != FTPPORT) 827 return NF_ACCEPT; 828 829 if (unlikely(ip_vs_iph_icmp(iph))) 830 return NF_DROP; 831 832 /* 833 * Notify the client that the destination is unreachable, and 834 * release the socket buffer. 835 * Since it is in IP layer, the TCP socket is not actually 836 * created, the TCP RST packet cannot be sent, instead that 837 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ 838 */ 839 #ifdef CONFIG_IP_VS_IPV6 840 if (svc->af == AF_INET6) { 841 if (!skb->dev) 842 skb->dev = net->loopback_dev; 843 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); 844 } else 845 #endif 846 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); 847 848 return NF_DROP; 849 } 850 851 #ifdef CONFIG_SYSCTL 852 853 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) 854 { 855 return ipvs->sysctl_snat_reroute; 856 } 857 858 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) 859 { 860 return ipvs->sysctl_nat_icmp_send; 861 } 862 863 #else 864 865 static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; } 866 static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; } 867 868 #endif 869 870 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) 871 { 872 return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); 873 } 874 875 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) 876 { 877 if (NF_INET_LOCAL_IN == hooknum) 878 return IP_DEFRAG_VS_IN; 879 if (NF_INET_FORWARD == hooknum) 880 return IP_DEFRAG_VS_FWD; 881 return IP_DEFRAG_VS_OUT; 882 } 883 884 static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs, 885 struct sk_buff *skb, u_int32_t user) 886 { 887 int err; 888 889 local_bh_disable(); 890 err = ip_defrag(ipvs->net, skb, user); 891 local_bh_enable(); 892 if (!err) 893 ip_send_check(ip_hdr(skb)); 894 895 return err; 896 } 897 898 static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af, 899 struct sk_buff *skb, unsigned int hooknum) 900 { 901 if (!sysctl_snat_reroute(ipvs)) 902 return 0; 903 /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ 904 if (NF_INET_LOCAL_IN == hooknum) 905 return 0; 906 #ifdef CONFIG_IP_VS_IPV6 907 if (af == AF_INET6) { 908 struct dst_entry *dst = skb_dst(skb); 909 910 if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && 911 ip6_route_me_harder(ipvs->net, skb->sk, skb) != 0) 912 return 1; 913 } else 914 #endif 915 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 916 ip_route_me_harder(ipvs->net, skb->sk, skb, RTN_LOCAL) != 0) 917 return 1; 918 919 return 0; 920 } 921 922 /* 923 * Packet has been made sufficiently writable in caller 924 * - inout: 1=in->out, 0=out->in 925 */ 926 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, 927 struct ip_vs_conn *cp, int inout) 928 { 929 struct iphdr *iph = ip_hdr(skb); 930 unsigned int icmp_offset = iph->ihl*4; 931 struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + 932 icmp_offset); 933 struct iphdr *ciph = (struct iphdr *)(icmph + 1); 934 935 if (inout) { 936 iph->saddr = cp->vaddr.ip; 937 ip_send_check(iph); 938 ciph->daddr = cp->vaddr.ip; 939 ip_send_check(ciph); 940 } else { 941 iph->daddr = cp->daddr.ip; 942 ip_send_check(iph); 943 ciph->saddr = cp->daddr.ip; 944 ip_send_check(ciph); 945 } 946 947 /* the TCP/UDP/SCTP port */ 948 if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol || 949 IPPROTO_SCTP == ciph->protocol) { 950 __be16 *ports = (void *)ciph + ciph->ihl*4; 951 952 if (inout) 953 ports[1] = cp->vport; 954 else 955 ports[0] = cp->dport; 956 } 957 958 /* And finally the ICMP checksum */ 959 icmph->checksum = 0; 960 icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); 961 skb->ip_summed = CHECKSUM_UNNECESSARY; 962 963 if (inout) 964 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 965 "Forwarding altered outgoing ICMP"); 966 else 967 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, 968 "Forwarding altered incoming ICMP"); 969 } 970 971 #ifdef CONFIG_IP_VS_IPV6 972 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, 973 struct ip_vs_conn *cp, int inout) 974 { 975 struct ipv6hdr *iph = ipv6_hdr(skb); 976 unsigned int icmp_offset = 0; 977 unsigned int offs = 0; /* header offset*/ 978 int protocol; 979 struct icmp6hdr *icmph; 980 struct ipv6hdr *ciph; 981 unsigned short fragoffs; 982 983 ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL); 984 icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset); 985 offs = icmp_offset + sizeof(struct icmp6hdr); 986 ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs); 987 988 protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL); 989 990 if (inout) { 991 iph->saddr = cp->vaddr.in6; 992 ciph->daddr = cp->vaddr.in6; 993 } else { 994 iph->daddr = cp->daddr.in6; 995 ciph->saddr = cp->daddr.in6; 996 } 997 998 /* the TCP/UDP/SCTP port */ 999 if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 1000 IPPROTO_SCTP == protocol)) { 1001 __be16 *ports = (void *)(skb_network_header(skb) + offs); 1002 1003 IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__, 1004 ntohs(inout ? ports[1] : ports[0]), 1005 ntohs(inout ? cp->vport : cp->dport)); 1006 if (inout) 1007 ports[1] = cp->vport; 1008 else 1009 ports[0] = cp->dport; 1010 } 1011 1012 /* And finally the ICMP checksum */ 1013 icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, 1014 skb->len - icmp_offset, 1015 IPPROTO_ICMPV6, 0); 1016 skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; 1017 skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); 1018 skb->ip_summed = CHECKSUM_PARTIAL; 1019 1020 if (inout) 1021 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 1022 (void *)ciph - (void *)iph, 1023 "Forwarding altered outgoing ICMPv6"); 1024 else 1025 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, 1026 (void *)ciph - (void *)iph, 1027 "Forwarding altered incoming ICMPv6"); 1028 } 1029 #endif 1030 1031 /* Handle relevant response ICMP messages - forward to the right 1032 * destination host. 1033 */ 1034 static int handle_response_icmp(int af, struct sk_buff *skb, 1035 union nf_inet_addr *snet, 1036 __u8 protocol, struct ip_vs_conn *cp, 1037 struct ip_vs_protocol *pp, 1038 unsigned int offset, unsigned int ihl, 1039 unsigned int hooknum) 1040 { 1041 unsigned int verdict = NF_DROP; 1042 1043 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 1044 goto after_nat; 1045 1046 /* Ensure the checksum is correct */ 1047 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1048 /* Failed checksum! */ 1049 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", 1050 IP_VS_DBG_ADDR(af, snet)); 1051 goto out; 1052 } 1053 1054 if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol || 1055 IPPROTO_SCTP == protocol) 1056 offset += 2 * sizeof(__u16); 1057 if (skb_ensure_writable(skb, offset)) 1058 goto out; 1059 1060 #ifdef CONFIG_IP_VS_IPV6 1061 if (af == AF_INET6) 1062 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 1063 else 1064 #endif 1065 ip_vs_nat_icmp(skb, pp, cp, 1); 1066 1067 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 1068 goto out; 1069 1070 after_nat: 1071 /* do the statistics and put it back */ 1072 ip_vs_out_stats(cp, skb); 1073 1074 skb->ipvs_property = 1; 1075 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1076 ip_vs_notrack(skb); 1077 else 1078 ip_vs_update_conntrack(skb, cp, 0); 1079 verdict = NF_ACCEPT; 1080 1081 out: 1082 __ip_vs_conn_put(cp); 1083 1084 return verdict; 1085 } 1086 1087 /* 1088 * Handle ICMP messages in the inside-to-outside direction (outgoing). 1089 * Find any that might be relevant, check against existing connections. 1090 * Currently handles error types - unreachable, quench, ttl exceeded. 1091 */ 1092 static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, 1093 int *related, unsigned int hooknum) 1094 { 1095 struct iphdr *iph; 1096 struct icmphdr _icmph, *ic; 1097 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1098 struct ip_vs_iphdr ciph; 1099 struct ip_vs_conn *cp; 1100 struct ip_vs_protocol *pp; 1101 unsigned int offset, ihl; 1102 union nf_inet_addr snet; 1103 1104 *related = 1; 1105 1106 /* reassemble IP fragments */ 1107 if (ip_is_fragment(ip_hdr(skb))) { 1108 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 1109 return NF_STOLEN; 1110 } 1111 1112 iph = ip_hdr(skb); 1113 offset = ihl = iph->ihl * 4; 1114 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1115 if (ic == NULL) 1116 return NF_DROP; 1117 1118 IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n", 1119 ic->type, ntohs(icmp_id(ic)), 1120 &iph->saddr, &iph->daddr); 1121 1122 /* 1123 * Work through seeing if this is for us. 1124 * These checks are supposed to be in an order that means easy 1125 * things are checked first to speed up processing.... however 1126 * this means that some packets will manage to get a long way 1127 * down this stack and then be rejected, but that's life. 1128 */ 1129 if ((ic->type != ICMP_DEST_UNREACH) && 1130 (ic->type != ICMP_SOURCE_QUENCH) && 1131 (ic->type != ICMP_TIME_EXCEEDED)) { 1132 *related = 0; 1133 return NF_ACCEPT; 1134 } 1135 1136 /* Now find the contained IP header */ 1137 offset += sizeof(_icmph); 1138 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1139 if (cih == NULL) 1140 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1141 1142 pp = ip_vs_proto_get(cih->protocol); 1143 if (!pp) 1144 return NF_ACCEPT; 1145 1146 /* Is the embedded protocol header present? */ 1147 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1148 pp->dont_defrag)) 1149 return NF_ACCEPT; 1150 1151 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1152 "Checking outgoing ICMP for"); 1153 1154 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph); 1155 1156 /* The embedded headers contain source and dest in reverse order */ 1157 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1158 ipvs, AF_INET, skb, &ciph); 1159 if (!cp) 1160 return NF_ACCEPT; 1161 1162 snet.ip = iph->saddr; 1163 return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, 1164 pp, ciph.len, ihl, hooknum); 1165 } 1166 1167 #ifdef CONFIG_IP_VS_IPV6 1168 static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1169 int *related, unsigned int hooknum, 1170 struct ip_vs_iphdr *ipvsh) 1171 { 1172 struct icmp6hdr _icmph, *ic; 1173 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1174 struct ip_vs_conn *cp; 1175 struct ip_vs_protocol *pp; 1176 union nf_inet_addr snet; 1177 unsigned int offset; 1178 1179 *related = 1; 1180 ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph); 1181 if (ic == NULL) 1182 return NF_DROP; 1183 1184 /* 1185 * Work through seeing if this is for us. 1186 * These checks are supposed to be in an order that means easy 1187 * things are checked first to speed up processing.... however 1188 * this means that some packets will manage to get a long way 1189 * down this stack and then be rejected, but that's life. 1190 */ 1191 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 1192 *related = 0; 1193 return NF_ACCEPT; 1194 } 1195 /* Fragment header that is before ICMP header tells us that: 1196 * it's not an error message since they can't be fragmented. 1197 */ 1198 if (ipvsh->flags & IP6_FH_F_FRAG) 1199 return NF_DROP; 1200 1201 IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n", 1202 ic->icmp6_type, ntohs(icmpv6_id(ic)), 1203 &ipvsh->saddr, &ipvsh->daddr); 1204 1205 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph), 1206 true, &ciph)) 1207 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1208 1209 pp = ip_vs_proto_get(ciph.protocol); 1210 if (!pp) 1211 return NF_ACCEPT; 1212 1213 /* The embedded headers contain source and dest in reverse order */ 1214 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1215 ipvs, AF_INET6, skb, &ciph); 1216 if (!cp) 1217 return NF_ACCEPT; 1218 1219 snet.in6 = ciph.saddr.in6; 1220 offset = ciph.len; 1221 return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, 1222 pp, offset, sizeof(struct ipv6hdr), 1223 hooknum); 1224 } 1225 #endif 1226 1227 /* 1228 * Check if sctp chunc is ABORT chunk 1229 */ 1230 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len) 1231 { 1232 struct sctp_chunkhdr *sch, schunk; 1233 sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr), 1234 sizeof(schunk), &schunk); 1235 if (sch == NULL) 1236 return 0; 1237 if (sch->type == SCTP_CID_ABORT) 1238 return 1; 1239 return 0; 1240 } 1241 1242 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) 1243 { 1244 struct tcphdr _tcph, *th; 1245 1246 th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); 1247 if (th == NULL) 1248 return 0; 1249 return th->rst; 1250 } 1251 1252 static inline bool is_new_conn(const struct sk_buff *skb, 1253 struct ip_vs_iphdr *iph) 1254 { 1255 switch (iph->protocol) { 1256 case IPPROTO_TCP: { 1257 struct tcphdr _tcph, *th; 1258 1259 th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); 1260 if (th == NULL) 1261 return false; 1262 return th->syn; 1263 } 1264 case IPPROTO_SCTP: { 1265 struct sctp_chunkhdr *sch, schunk; 1266 1267 sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr), 1268 sizeof(schunk), &schunk); 1269 if (sch == NULL) 1270 return false; 1271 return sch->type == SCTP_CID_INIT; 1272 } 1273 default: 1274 return false; 1275 } 1276 } 1277 1278 static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, 1279 int conn_reuse_mode) 1280 { 1281 /* Controlled (FTP DATA or persistence)? */ 1282 if (cp->control) 1283 return false; 1284 1285 switch (cp->protocol) { 1286 case IPPROTO_TCP: 1287 return (cp->state == IP_VS_TCP_S_TIME_WAIT) || 1288 (cp->state == IP_VS_TCP_S_CLOSE) || 1289 ((conn_reuse_mode & 2) && 1290 (cp->state == IP_VS_TCP_S_FIN_WAIT) && 1291 (cp->flags & IP_VS_CONN_F_NOOUTPUT)); 1292 case IPPROTO_SCTP: 1293 return cp->state == IP_VS_SCTP_S_CLOSED; 1294 default: 1295 return false; 1296 } 1297 } 1298 1299 /* Generic function to create new connections for outgoing RS packets 1300 * 1301 * Pre-requisites for successful connection creation: 1302 * 1) Virtual Service is NOT fwmark based: 1303 * In fwmark-VS actual vaddr and vport are unknown to IPVS 1304 * 2) Real Server and Virtual Service were NOT configured without port: 1305 * This is to allow match of different VS to the same RS ip-addr 1306 */ 1307 struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, 1308 struct ip_vs_dest *dest, 1309 struct sk_buff *skb, 1310 const struct ip_vs_iphdr *iph, 1311 __be16 dport, 1312 __be16 cport) 1313 { 1314 struct ip_vs_conn_param param; 1315 struct ip_vs_conn *ct = NULL, *cp = NULL; 1316 const union nf_inet_addr *vaddr, *daddr, *caddr; 1317 union nf_inet_addr snet; 1318 __be16 vport; 1319 unsigned int flags; 1320 1321 vaddr = &svc->addr; 1322 vport = svc->port; 1323 daddr = &iph->saddr; 1324 caddr = &iph->daddr; 1325 1326 /* check pre-requisites are satisfied */ 1327 if (svc->fwmark) 1328 return NULL; 1329 if (!vport || !dport) 1330 return NULL; 1331 1332 /* for persistent service first create connection template */ 1333 if (svc->flags & IP_VS_SVC_F_PERSISTENT) { 1334 /* apply netmask the same way ingress-side does */ 1335 #ifdef CONFIG_IP_VS_IPV6 1336 if (svc->af == AF_INET6) 1337 ipv6_addr_prefix(&snet.in6, &caddr->in6, 1338 (__force __u32)svc->netmask); 1339 else 1340 #endif 1341 snet.ip = caddr->ip & svc->netmask; 1342 /* fill params and create template if not existent */ 1343 if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, 1344 &snet, 0, vaddr, 1345 vport, ¶m) < 0) 1346 return NULL; 1347 ct = ip_vs_ct_in_get(¶m); 1348 /* check if template exists and points to the same dest */ 1349 if (!ct || !ip_vs_check_template(ct, dest)) { 1350 ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, 1351 IP_VS_CONN_F_TEMPLATE, dest, 0); 1352 if (!ct) { 1353 kfree(param.pe_data); 1354 return NULL; 1355 } 1356 ct->timeout = svc->timeout; 1357 } else { 1358 kfree(param.pe_data); 1359 } 1360 } 1361 1362 /* connection flags */ 1363 flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && 1364 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; 1365 /* create connection */ 1366 ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, 1367 caddr, cport, vaddr, vport, ¶m); 1368 cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); 1369 if (!cp) { 1370 if (ct) 1371 ip_vs_conn_put(ct); 1372 return NULL; 1373 } 1374 if (ct) { 1375 ip_vs_control_add(cp, ct); 1376 ip_vs_conn_put(ct); 1377 } 1378 ip_vs_conn_stats(cp, svc); 1379 1380 /* return connection (will be used to handle outgoing packet) */ 1381 IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " 1382 "d:%s:%u conn->flags:%X conn->refcnt:%d\n", 1383 ip_vs_fwd_tag(cp), 1384 IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), 1385 IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), 1386 IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), 1387 cp->flags, refcount_read(&cp->refcnt)); 1388 return cp; 1389 } 1390 1391 /* Handle outgoing packets which are considered requests initiated by 1392 * real servers, so that subsequent responses from external client can be 1393 * routed to the right real server. 1394 * Used also for outgoing responses in OPS mode. 1395 * 1396 * Connection management is handled by persistent-engine specific callback. 1397 */ 1398 static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, 1399 struct netns_ipvs *ipvs, 1400 int af, struct sk_buff *skb, 1401 const struct ip_vs_iphdr *iph) 1402 { 1403 struct ip_vs_dest *dest; 1404 struct ip_vs_conn *cp = NULL; 1405 __be16 _ports[2], *pptr; 1406 1407 if (hooknum == NF_INET_LOCAL_IN) 1408 return NULL; 1409 1410 pptr = frag_safe_skb_hp(skb, iph->len, 1411 sizeof(_ports), _ports); 1412 if (!pptr) 1413 return NULL; 1414 1415 dest = ip_vs_find_real_service(ipvs, af, iph->protocol, 1416 &iph->saddr, pptr[0]); 1417 if (dest) { 1418 struct ip_vs_service *svc; 1419 struct ip_vs_pe *pe; 1420 1421 svc = rcu_dereference(dest->svc); 1422 if (svc) { 1423 pe = rcu_dereference(svc->pe); 1424 if (pe && pe->conn_out) 1425 cp = pe->conn_out(svc, dest, skb, iph, 1426 pptr[0], pptr[1]); 1427 } 1428 } 1429 1430 return cp; 1431 } 1432 1433 /* Handle response packets: rewrite addresses and send away... 1434 */ 1435 static unsigned int 1436 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, 1437 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, 1438 unsigned int hooknum) 1439 { 1440 struct ip_vs_protocol *pp = pd->pp; 1441 1442 if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) 1443 goto after_nat; 1444 1445 IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet"); 1446 1447 if (skb_ensure_writable(skb, iph->len)) 1448 goto drop; 1449 1450 /* mangle the packet */ 1451 if (pp->snat_handler && 1452 !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph)) 1453 goto drop; 1454 1455 #ifdef CONFIG_IP_VS_IPV6 1456 if (af == AF_INET6) 1457 ipv6_hdr(skb)->saddr = cp->vaddr.in6; 1458 else 1459 #endif 1460 { 1461 ip_hdr(skb)->saddr = cp->vaddr.ip; 1462 ip_send_check(ip_hdr(skb)); 1463 } 1464 1465 /* 1466 * nf_iterate does not expect change in the skb->dst->dev. 1467 * It looks like it is not fatal to enable this code for hooks 1468 * where our handlers are at the end of the chain list and 1469 * when all next handlers use skb->dst->dev and not outdev. 1470 * It will definitely route properly the inout NAT traffic 1471 * when multiple paths are used. 1472 */ 1473 1474 /* For policy routing, packets originating from this 1475 * machine itself may be routed differently to packets 1476 * passing through. We want this packet to be routed as 1477 * if it came from this machine itself. So re-compute 1478 * the routing information. 1479 */ 1480 if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum)) 1481 goto drop; 1482 1483 IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT"); 1484 1485 after_nat: 1486 ip_vs_out_stats(cp, skb); 1487 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); 1488 skb->ipvs_property = 1; 1489 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1490 ip_vs_notrack(skb); 1491 else 1492 ip_vs_update_conntrack(skb, cp, 0); 1493 ip_vs_conn_put(cp); 1494 1495 return NF_ACCEPT; 1496 1497 drop: 1498 ip_vs_conn_put(cp); 1499 kfree_skb(skb); 1500 return NF_STOLEN; 1501 } 1502 1503 /* 1504 * Check if outgoing packet belongs to the established ip_vs_conn. 1505 */ 1506 static unsigned int 1507 ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) 1508 { 1509 struct netns_ipvs *ipvs = net_ipvs(state->net); 1510 unsigned int hooknum = state->hook; 1511 struct ip_vs_iphdr iph; 1512 struct ip_vs_protocol *pp; 1513 struct ip_vs_proto_data *pd; 1514 struct ip_vs_conn *cp; 1515 int af = state->pf; 1516 struct sock *sk; 1517 1518 /* Already marked as IPVS request or reply? */ 1519 if (skb->ipvs_property) 1520 return NF_ACCEPT; 1521 1522 sk = skb_to_full_sk(skb); 1523 /* Bad... Do not break raw sockets */ 1524 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 1525 af == AF_INET)) { 1526 1527 if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) 1528 return NF_ACCEPT; 1529 } 1530 1531 if (unlikely(!skb_dst(skb))) 1532 return NF_ACCEPT; 1533 1534 ip_vs_fill_iph_skb(af, skb, false, &iph); 1535 #ifdef CONFIG_IP_VS_IPV6 1536 if (af == AF_INET6) { 1537 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 1538 int related; 1539 int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related, 1540 hooknum, &iph); 1541 1542 if (related) 1543 return verdict; 1544 } 1545 } else 1546 #endif 1547 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 1548 int related; 1549 int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum); 1550 1551 if (related) 1552 return verdict; 1553 } 1554 1555 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 1556 if (unlikely(!pd)) 1557 return NF_ACCEPT; 1558 pp = pd->pp; 1559 1560 /* reassemble IP fragments */ 1561 #ifdef CONFIG_IP_VS_IPV6 1562 if (af == AF_INET) 1563 #endif 1564 if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) { 1565 if (ip_vs_gather_frags(ipvs, skb, 1566 ip_vs_defrag_user(hooknum))) 1567 return NF_STOLEN; 1568 1569 ip_vs_fill_iph_skb(AF_INET, skb, false, &iph); 1570 } 1571 1572 /* 1573 * Check if the packet belongs to an existing entry 1574 */ 1575 cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto, 1576 ipvs, af, skb, &iph); 1577 1578 if (likely(cp)) 1579 return handle_response(af, skb, pd, cp, &iph, hooknum); 1580 1581 /* Check for real-server-started requests */ 1582 if (atomic_read(&ipvs->conn_out_counter[ip_vs_af_index(af)])) { 1583 /* Currently only for UDP: 1584 * connection oriented protocols typically use 1585 * ephemeral ports for outgoing connections, so 1586 * related incoming responses would not match any VS 1587 */ 1588 if (pp->protocol == IPPROTO_UDP) { 1589 cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); 1590 if (likely(cp)) 1591 return handle_response(af, skb, pd, cp, &iph, 1592 hooknum); 1593 } 1594 } 1595 1596 if (sysctl_nat_icmp_send(ipvs) && 1597 (pp->protocol == IPPROTO_TCP || 1598 pp->protocol == IPPROTO_UDP || 1599 pp->protocol == IPPROTO_SCTP)) { 1600 __be16 _ports[2], *pptr; 1601 1602 pptr = frag_safe_skb_hp(skb, iph.len, 1603 sizeof(_ports), _ports); 1604 if (pptr == NULL) 1605 return NF_ACCEPT; /* Not for me */ 1606 if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr, 1607 pptr[0])) { 1608 /* 1609 * Notify the real server: there is no 1610 * existing entry if it is not RST 1611 * packet or not TCP packet. 1612 */ 1613 if ((iph.protocol != IPPROTO_TCP && 1614 iph.protocol != IPPROTO_SCTP) 1615 || ((iph.protocol == IPPROTO_TCP 1616 && !is_tcp_reset(skb, iph.len)) 1617 || (iph.protocol == IPPROTO_SCTP 1618 && !is_sctp_abort(skb, 1619 iph.len)))) { 1620 #ifdef CONFIG_IP_VS_IPV6 1621 if (af == AF_INET6) { 1622 if (!skb->dev) 1623 skb->dev = ipvs->net->loopback_dev; 1624 icmpv6_send(skb, 1625 ICMPV6_DEST_UNREACH, 1626 ICMPV6_PORT_UNREACH, 1627 0); 1628 } else 1629 #endif 1630 icmp_send(skb, 1631 ICMP_DEST_UNREACH, 1632 ICMP_PORT_UNREACH, 0); 1633 return NF_DROP; 1634 } 1635 } 1636 } 1637 1638 IP_VS_DBG_PKT(12, af, pp, skb, iph.off, 1639 "ip_vs_out: packet continues traversal as normal"); 1640 return NF_ACCEPT; 1641 } 1642 1643 static unsigned int 1644 ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, 1645 struct ip_vs_proto_data *pd, 1646 int *verdict, struct ip_vs_conn **cpp, 1647 struct ip_vs_iphdr *iph) 1648 { 1649 struct ip_vs_protocol *pp = pd->pp; 1650 1651 if (!iph->fragoffs) { 1652 /* No (second) fragments need to enter here, as nf_defrag_ipv6 1653 * replayed fragment zero will already have created the cp 1654 */ 1655 1656 /* Schedule and create new connection entry into cpp */ 1657 if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph)) 1658 return 0; 1659 } 1660 1661 if (unlikely(!*cpp)) { 1662 /* sorry, all this trouble for a no-hit :) */ 1663 IP_VS_DBG_PKT(12, af, pp, skb, iph->off, 1664 "ip_vs_in: packet continues traversal as normal"); 1665 1666 /* Fragment couldn't be mapped to a conn entry */ 1667 if (iph->fragoffs) 1668 IP_VS_DBG_PKT(7, af, pp, skb, iph->off, 1669 "unhandled fragment"); 1670 1671 *verdict = NF_ACCEPT; 1672 return 0; 1673 } 1674 1675 return 1; 1676 } 1677 1678 /* Check the UDP tunnel and return its header length */ 1679 static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1680 unsigned int offset, __u16 af, 1681 const union nf_inet_addr *daddr, __u8 *proto) 1682 { 1683 struct udphdr _udph, *udph; 1684 struct ip_vs_dest *dest; 1685 1686 udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); 1687 if (!udph) 1688 goto unk; 1689 offset += sizeof(struct udphdr); 1690 dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest); 1691 if (!dest) 1692 goto unk; 1693 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1694 struct guehdr _gueh, *gueh; 1695 1696 gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh); 1697 if (!gueh) 1698 goto unk; 1699 if (gueh->control != 0 || gueh->version != 0) 1700 goto unk; 1701 /* Later we can support also IPPROTO_IPV6 */ 1702 if (gueh->proto_ctype != IPPROTO_IPIP) 1703 goto unk; 1704 *proto = gueh->proto_ctype; 1705 return sizeof(struct udphdr) + sizeof(struct guehdr) + 1706 (gueh->hlen << 2); 1707 } 1708 1709 unk: 1710 return 0; 1711 } 1712 1713 /* Check the GRE tunnel and return its header length */ 1714 static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb, 1715 unsigned int offset, __u16 af, 1716 const union nf_inet_addr *daddr, __u8 *proto) 1717 { 1718 struct gre_base_hdr _greh, *greh; 1719 struct ip_vs_dest *dest; 1720 1721 greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh); 1722 if (!greh) 1723 goto unk; 1724 dest = ip_vs_find_tunnel(ipvs, af, daddr, 0); 1725 if (!dest) 1726 goto unk; 1727 if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { 1728 IP_TUNNEL_DECLARE_FLAGS(flags); 1729 __be16 type; 1730 1731 /* Only support version 0 and C (csum) */ 1732 if ((greh->flags & ~GRE_CSUM) != 0) 1733 goto unk; 1734 type = greh->protocol; 1735 /* Later we can support also IPPROTO_IPV6 */ 1736 if (type != htons(ETH_P_IP)) 1737 goto unk; 1738 *proto = IPPROTO_IPIP; 1739 1740 gre_flags_to_tnl_flags(flags, greh->flags); 1741 1742 return gre_calc_hlen(flags); 1743 } 1744 1745 unk: 1746 return 0; 1747 } 1748 1749 /* 1750 * Handle ICMP messages in the outside-to-inside direction (incoming). 1751 * Find any that might be relevant, check against existing connections, 1752 * forward to the right destination host if relevant. 1753 * Currently handles error types - unreachable, quench, ttl exceeded. 1754 */ 1755 static int 1756 ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, 1757 unsigned int hooknum) 1758 { 1759 struct iphdr *iph; 1760 struct icmphdr _icmph, *ic; 1761 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1762 struct ip_vs_iphdr ciph; 1763 struct ip_vs_conn *cp; 1764 struct ip_vs_protocol *pp; 1765 struct ip_vs_proto_data *pd; 1766 unsigned int offset, offset2, ihl, verdict; 1767 bool tunnel, new_cp = false; 1768 union nf_inet_addr *raddr; 1769 char *outer_proto = "IPIP"; 1770 1771 *related = 1; 1772 1773 /* reassemble IP fragments */ 1774 if (ip_is_fragment(ip_hdr(skb))) { 1775 if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum))) 1776 return NF_STOLEN; 1777 } 1778 1779 iph = ip_hdr(skb); 1780 offset = ihl = iph->ihl * 4; 1781 ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); 1782 if (ic == NULL) 1783 return NF_DROP; 1784 1785 IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n", 1786 ic->type, ntohs(icmp_id(ic)), 1787 &iph->saddr, &iph->daddr); 1788 1789 /* 1790 * Work through seeing if this is for us. 1791 * These checks are supposed to be in an order that means easy 1792 * things are checked first to speed up processing.... however 1793 * this means that some packets will manage to get a long way 1794 * down this stack and then be rejected, but that's life. 1795 */ 1796 if ((ic->type != ICMP_DEST_UNREACH) && 1797 (ic->type != ICMP_SOURCE_QUENCH) && 1798 (ic->type != ICMP_TIME_EXCEEDED)) { 1799 *related = 0; 1800 return NF_ACCEPT; 1801 } 1802 1803 /* Now find the contained IP header */ 1804 offset += sizeof(_icmph); 1805 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1806 if (cih == NULL) 1807 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1808 raddr = (union nf_inet_addr *)&cih->daddr; 1809 1810 /* Special case for errors for IPIP/UDP/GRE tunnel packets */ 1811 tunnel = false; 1812 if (cih->protocol == IPPROTO_IPIP) { 1813 struct ip_vs_dest *dest; 1814 1815 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1816 return NF_ACCEPT; 1817 /* Error for our IPIP must arrive at LOCAL_IN */ 1818 if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) 1819 return NF_ACCEPT; 1820 dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0); 1821 /* Only for known tunnel */ 1822 if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP) 1823 return NF_ACCEPT; 1824 offset += cih->ihl * 4; 1825 cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); 1826 if (cih == NULL) 1827 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1828 tunnel = true; 1829 } else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */ 1830 cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */ 1831 /* Error for our tunnel must arrive at LOCAL_IN */ 1832 (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) { 1833 __u8 iproto; 1834 int ulen; 1835 1836 /* Non-first fragment has no UDP/GRE header */ 1837 if (unlikely(cih->frag_off & htons(IP_OFFSET))) 1838 return NF_ACCEPT; 1839 offset2 = offset + cih->ihl * 4; 1840 if (cih->protocol == IPPROTO_UDP) { 1841 ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET, 1842 raddr, &iproto); 1843 outer_proto = "UDP"; 1844 } else { 1845 ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET, 1846 raddr, &iproto); 1847 outer_proto = "GRE"; 1848 } 1849 if (ulen > 0) { 1850 /* Skip IP and UDP/GRE tunnel headers */ 1851 offset = offset2 + ulen; 1852 /* Now we should be at the original IP header */ 1853 cih = skb_header_pointer(skb, offset, sizeof(_ciph), 1854 &_ciph); 1855 if (cih && cih->version == 4 && cih->ihl >= 5 && 1856 iproto == IPPROTO_IPIP) 1857 tunnel = true; 1858 else 1859 return NF_ACCEPT; 1860 } 1861 } 1862 1863 pd = ip_vs_proto_data_get(ipvs, cih->protocol); 1864 if (!pd) 1865 return NF_ACCEPT; 1866 pp = pd->pp; 1867 1868 /* Is the embedded protocol header present? */ 1869 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1870 pp->dont_defrag)) 1871 return NF_ACCEPT; 1872 1873 IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, 1874 "Checking incoming ICMP for"); 1875 1876 offset2 = offset; 1877 ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph); 1878 offset = ciph.len; 1879 1880 /* The embedded headers contain source and dest in reverse order. 1881 * For IPIP/UDP/GRE tunnel this is error for request, not for reply. 1882 */ 1883 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 1884 ipvs, AF_INET, skb, &ciph); 1885 1886 if (!cp) { 1887 int v; 1888 1889 if (tunnel || !sysctl_schedule_icmp(ipvs)) 1890 return NF_ACCEPT; 1891 1892 if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) 1893 return v; 1894 new_cp = true; 1895 } 1896 1897 verdict = NF_DROP; 1898 1899 /* Ensure the checksum is correct */ 1900 if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { 1901 /* Failed checksum! */ 1902 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n", 1903 &iph->saddr); 1904 goto out; 1905 } 1906 1907 if (tunnel) { 1908 __be32 info = ic->un.gateway; 1909 __u8 type = ic->type; 1910 __u8 code = ic->code; 1911 1912 /* Update the MTU */ 1913 if (ic->type == ICMP_DEST_UNREACH && 1914 ic->code == ICMP_FRAG_NEEDED) { 1915 struct ip_vs_dest *dest = cp->dest; 1916 u32 mtu = ntohs(ic->un.frag.mtu); 1917 __be16 frag_off = cih->frag_off; 1918 1919 /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */ 1920 if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL) 1921 goto ignore_tunnel; 1922 offset2 -= ihl + sizeof(_icmph); 1923 skb_reset_network_header(skb); 1924 IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n", 1925 outer_proto, &ip_hdr(skb)->saddr, 1926 &ip_hdr(skb)->daddr, mtu); 1927 ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0); 1928 /* Client uses PMTUD? */ 1929 if (!(frag_off & htons(IP_DF))) 1930 goto ignore_tunnel; 1931 /* Prefer the resulting PMTU */ 1932 if (dest) { 1933 struct ip_vs_dest_dst *dest_dst; 1934 1935 dest_dst = rcu_dereference(dest->dest_dst); 1936 if (dest_dst) 1937 mtu = dst_mtu(dest_dst->dst_cache); 1938 } 1939 if (mtu > 68 + sizeof(struct iphdr)) 1940 mtu -= sizeof(struct iphdr); 1941 info = htonl(mtu); 1942 } 1943 /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of 1944 * original request. 1945 */ 1946 if (pskb_pull(skb, offset2) == NULL) 1947 goto ignore_tunnel; 1948 skb_reset_network_header(skb); 1949 IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n", 1950 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1951 type, code, ntohl(info)); 1952 icmp_send(skb, type, code, info); 1953 /* ICMP can be shorter but anyways, account it */ 1954 ip_vs_out_stats(cp, skb); 1955 1956 ignore_tunnel: 1957 consume_skb(skb); 1958 verdict = NF_STOLEN; 1959 goto out; 1960 } 1961 1962 /* do the statistics and put it back */ 1963 ip_vs_in_stats(cp, skb); 1964 if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol || 1965 IPPROTO_SCTP == cih->protocol) 1966 offset += 2 * sizeof(__u16); 1967 verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph); 1968 1969 out: 1970 if (likely(!new_cp)) 1971 __ip_vs_conn_put(cp); 1972 else 1973 ip_vs_conn_put(cp); 1974 1975 return verdict; 1976 } 1977 1978 #ifdef CONFIG_IP_VS_IPV6 1979 static int ip_vs_in_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb, 1980 int *related, unsigned int hooknum, 1981 struct ip_vs_iphdr *iph) 1982 { 1983 struct icmp6hdr _icmph, *ic; 1984 struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */ 1985 struct ip_vs_conn *cp; 1986 struct ip_vs_protocol *pp; 1987 struct ip_vs_proto_data *pd; 1988 unsigned int offset, verdict; 1989 bool new_cp = false; 1990 1991 *related = 1; 1992 1993 ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph); 1994 if (ic == NULL) 1995 return NF_DROP; 1996 1997 /* 1998 * Work through seeing if this is for us. 1999 * These checks are supposed to be in an order that means easy 2000 * things are checked first to speed up processing.... however 2001 * this means that some packets will manage to get a long way 2002 * down this stack and then be rejected, but that's life. 2003 */ 2004 if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) { 2005 *related = 0; 2006 return NF_ACCEPT; 2007 } 2008 /* Fragment header that is before ICMP header tells us that: 2009 * it's not an error message since they can't be fragmented. 2010 */ 2011 if (iph->flags & IP6_FH_F_FRAG) 2012 return NF_DROP; 2013 2014 IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n", 2015 ic->icmp6_type, ntohs(icmpv6_id(ic)), 2016 &iph->saddr, &iph->daddr); 2017 2018 offset = iph->len + sizeof(_icmph); 2019 if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, true, &ciph)) 2020 return NF_ACCEPT; 2021 2022 pd = ip_vs_proto_data_get(ipvs, ciph.protocol); 2023 if (!pd) 2024 return NF_ACCEPT; 2025 pp = pd->pp; 2026 2027 /* Cannot handle fragmented embedded protocol */ 2028 if (ciph.fragoffs) 2029 return NF_ACCEPT; 2030 2031 IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, 2032 "Checking incoming ICMPv6 for"); 2033 2034 /* The embedded headers contain source and dest in reverse order 2035 * if not from localhost 2036 */ 2037 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 2038 ipvs, AF_INET6, skb, &ciph); 2039 2040 if (!cp) { 2041 int v; 2042 2043 if (!sysctl_schedule_icmp(ipvs)) 2044 return NF_ACCEPT; 2045 2046 if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, &v, &cp, &ciph)) 2047 return v; 2048 2049 new_cp = true; 2050 } 2051 2052 /* VS/TUN, VS/DR and LOCALNODE just let it go */ 2053 if ((hooknum == NF_INET_LOCAL_OUT) && 2054 (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) { 2055 verdict = NF_ACCEPT; 2056 goto out; 2057 } 2058 2059 /* do the statistics and put it back */ 2060 ip_vs_in_stats(cp, skb); 2061 2062 /* Need to mangle contained IPv6 header in ICMPv6 packet */ 2063 offset = ciph.len; 2064 if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol || 2065 IPPROTO_SCTP == ciph.protocol) 2066 offset += 2 * sizeof(__u16); /* Also mangle ports */ 2067 2068 verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, &ciph); 2069 2070 out: 2071 if (likely(!new_cp)) 2072 __ip_vs_conn_put(cp); 2073 else 2074 ip_vs_conn_put(cp); 2075 2076 return verdict; 2077 } 2078 #endif 2079 2080 2081 /* 2082 * Check if it's for virtual services, look it up, 2083 * and send it on its way... 2084 */ 2085 static unsigned int 2086 ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) 2087 { 2088 struct netns_ipvs *ipvs = net_ipvs(state->net); 2089 unsigned int hooknum = state->hook; 2090 struct ip_vs_iphdr iph; 2091 struct ip_vs_protocol *pp; 2092 struct ip_vs_proto_data *pd; 2093 struct ip_vs_conn *cp; 2094 int ret, pkts; 2095 struct sock *sk; 2096 int af = state->pf; 2097 2098 /* Already marked as IPVS request or reply? */ 2099 if (skb->ipvs_property) 2100 return NF_ACCEPT; 2101 2102 /* 2103 * Big tappo: 2104 * - remote client: only PACKET_HOST 2105 * - route: used for struct net when skb->dev is unset 2106 */ 2107 if (unlikely((skb->pkt_type != PACKET_HOST && 2108 hooknum != NF_INET_LOCAL_OUT) || 2109 !skb_dst(skb))) { 2110 ip_vs_fill_iph_skb(af, skb, false, &iph); 2111 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" 2112 " ignored in hook %u\n", 2113 skb->pkt_type, iph.protocol, 2114 IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); 2115 return NF_ACCEPT; 2116 } 2117 /* ipvs enabled in this netns ? */ 2118 if (unlikely(sysctl_backup_only(ipvs))) 2119 return NF_ACCEPT; 2120 2121 ip_vs_fill_iph_skb(af, skb, false, &iph); 2122 2123 /* Bad... Do not break raw sockets */ 2124 sk = skb_to_full_sk(skb); 2125 if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT && 2126 af == AF_INET)) { 2127 2128 if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk)) 2129 return NF_ACCEPT; 2130 } 2131 2132 #ifdef CONFIG_IP_VS_IPV6 2133 if (af == AF_INET6) { 2134 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { 2135 int related; 2136 int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related, 2137 hooknum, &iph); 2138 2139 if (related) 2140 return verdict; 2141 } 2142 } else 2143 #endif 2144 if (unlikely(iph.protocol == IPPROTO_ICMP)) { 2145 int related; 2146 int verdict = ip_vs_in_icmp(ipvs, skb, &related, 2147 hooknum); 2148 2149 if (related) 2150 return verdict; 2151 } 2152 2153 /* Protocol supported? */ 2154 pd = ip_vs_proto_data_get(ipvs, iph.protocol); 2155 if (unlikely(!pd)) { 2156 /* The only way we'll see this packet again is if it's 2157 * encapsulated, so mark it with ipvs_property=1 so we 2158 * skip it if we're ignoring tunneled packets 2159 */ 2160 if (sysctl_ignore_tunneled(ipvs)) 2161 skb->ipvs_property = 1; 2162 2163 return NF_ACCEPT; 2164 } 2165 pp = pd->pp; 2166 /* 2167 * Check if the packet belongs to an existing connection entry 2168 */ 2169 cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto, 2170 ipvs, af, skb, &iph); 2171 2172 if (!iph.fragoffs && is_new_conn(skb, &iph) && cp) { 2173 int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); 2174 bool old_ct = false, resched = false; 2175 2176 if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && 2177 unlikely(!atomic_read(&cp->dest->weight))) { 2178 resched = true; 2179 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2180 } else if (conn_reuse_mode && 2181 is_new_conn_expected(cp, conn_reuse_mode)) { 2182 old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2183 if (!atomic_read(&cp->n_control)) { 2184 resched = true; 2185 } else { 2186 /* Do not reschedule controlling connection 2187 * that uses conntrack while it is still 2188 * referenced by controlled connection(s). 2189 */ 2190 resched = !old_ct; 2191 } 2192 } 2193 2194 if (resched) { 2195 if (!old_ct) 2196 cp->flags &= ~IP_VS_CONN_F_NFCT; 2197 if (!atomic_read(&cp->n_control)) 2198 ip_vs_conn_expire_now(cp); 2199 __ip_vs_conn_put(cp); 2200 if (old_ct) 2201 return NF_DROP; 2202 cp = NULL; 2203 } 2204 } 2205 2206 /* Check the server status */ 2207 if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 2208 /* the destination server is not available */ 2209 if (sysctl_expire_nodest_conn(ipvs)) { 2210 bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); 2211 2212 if (!old_ct) 2213 cp->flags &= ~IP_VS_CONN_F_NFCT; 2214 2215 ip_vs_conn_expire_now(cp); 2216 __ip_vs_conn_put(cp); 2217 if (old_ct) 2218 return NF_DROP; 2219 cp = NULL; 2220 } else { 2221 __ip_vs_conn_put(cp); 2222 return NF_DROP; 2223 } 2224 } 2225 2226 if (unlikely(!cp)) { 2227 int v; 2228 2229 if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph)) 2230 return v; 2231 } 2232 2233 IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet"); 2234 2235 ip_vs_in_stats(cp, skb); 2236 ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); 2237 if (cp->packet_xmit) 2238 ret = cp->packet_xmit(skb, cp, pp, &iph); 2239 /* do not touch skb anymore */ 2240 else { 2241 IP_VS_DBG_RL("warning: packet_xmit is null"); 2242 ret = NF_ACCEPT; 2243 } 2244 2245 /* Increase its packet counter and check if it is needed 2246 * to be synchronized 2247 * 2248 * Sync connection if it is about to close to 2249 * encorage the standby servers to update the connections timeout 2250 * 2251 * For ONE_PKT let ip_vs_sync_conn() do the filter work. 2252 */ 2253 2254 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 2255 pkts = sysctl_sync_threshold(ipvs); 2256 else 2257 pkts = atomic_inc_return(&cp->in_pkts); 2258 2259 if (ipvs->sync_state & IP_VS_STATE_MASTER) 2260 ip_vs_sync_conn(ipvs, cp, pkts); 2261 else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) 2262 /* increment is done inside ip_vs_sync_conn too */ 2263 atomic_inc(&cp->control->in_pkts); 2264 2265 ip_vs_conn_put(cp); 2266 return ret; 2267 } 2268 2269 /* 2270 * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP 2271 * related packets destined for 0.0.0.0/0. 2272 * When fwmark-based virtual service is used, such as transparent 2273 * cache cluster, TCP packets can be marked and routed to ip_vs_in, 2274 * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and 2275 * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain 2276 * and send them to ip_vs_in_icmp. 2277 */ 2278 static unsigned int 2279 ip_vs_forward_icmp(void *priv, struct sk_buff *skb, 2280 const struct nf_hook_state *state) 2281 { 2282 struct netns_ipvs *ipvs = net_ipvs(state->net); 2283 int r; 2284 2285 /* ipvs enabled in this netns ? */ 2286 if (unlikely(sysctl_backup_only(ipvs))) 2287 return NF_ACCEPT; 2288 2289 if (state->pf == NFPROTO_IPV4) { 2290 if (ip_hdr(skb)->protocol != IPPROTO_ICMP) 2291 return NF_ACCEPT; 2292 #ifdef CONFIG_IP_VS_IPV6 2293 } else { 2294 struct ip_vs_iphdr iphdr; 2295 2296 ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); 2297 2298 if (iphdr.protocol != IPPROTO_ICMPV6) 2299 return NF_ACCEPT; 2300 2301 return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); 2302 #endif 2303 } 2304 2305 return ip_vs_in_icmp(ipvs, skb, &r, state->hook); 2306 } 2307 2308 static const struct nf_hook_ops ip_vs_ops4[] = { 2309 /* After packet filtering, change source only for VS/NAT */ 2310 { 2311 .hook = ip_vs_out_hook, 2312 .pf = NFPROTO_IPV4, 2313 .hooknum = NF_INET_LOCAL_IN, 2314 .priority = NF_IP_PRI_NAT_SRC - 2, 2315 }, 2316 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2317 * or VS/NAT(change destination), so that filtering rules can be 2318 * applied to IPVS. */ 2319 { 2320 .hook = ip_vs_in_hook, 2321 .pf = NFPROTO_IPV4, 2322 .hooknum = NF_INET_LOCAL_IN, 2323 .priority = NF_IP_PRI_NAT_SRC - 1, 2324 }, 2325 /* Before ip_vs_in, change source only for VS/NAT */ 2326 { 2327 .hook = ip_vs_out_hook, 2328 .pf = NFPROTO_IPV4, 2329 .hooknum = NF_INET_LOCAL_OUT, 2330 .priority = NF_IP_PRI_NAT_DST + 1, 2331 }, 2332 /* After mangle, schedule and forward local requests */ 2333 { 2334 .hook = ip_vs_in_hook, 2335 .pf = NFPROTO_IPV4, 2336 .hooknum = NF_INET_LOCAL_OUT, 2337 .priority = NF_IP_PRI_NAT_DST + 2, 2338 }, 2339 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2340 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2341 { 2342 .hook = ip_vs_forward_icmp, 2343 .pf = NFPROTO_IPV4, 2344 .hooknum = NF_INET_FORWARD, 2345 .priority = 99, 2346 }, 2347 /* After packet filtering, change source only for VS/NAT */ 2348 { 2349 .hook = ip_vs_out_hook, 2350 .pf = NFPROTO_IPV4, 2351 .hooknum = NF_INET_FORWARD, 2352 .priority = 100, 2353 }, 2354 }; 2355 2356 #ifdef CONFIG_IP_VS_IPV6 2357 static const struct nf_hook_ops ip_vs_ops6[] = { 2358 /* After packet filtering, change source only for VS/NAT */ 2359 { 2360 .hook = ip_vs_out_hook, 2361 .pf = NFPROTO_IPV6, 2362 .hooknum = NF_INET_LOCAL_IN, 2363 .priority = NF_IP6_PRI_NAT_SRC - 2, 2364 }, 2365 /* After packet filtering, forward packet through VS/DR, VS/TUN, 2366 * or VS/NAT(change destination), so that filtering rules can be 2367 * applied to IPVS. */ 2368 { 2369 .hook = ip_vs_in_hook, 2370 .pf = NFPROTO_IPV6, 2371 .hooknum = NF_INET_LOCAL_IN, 2372 .priority = NF_IP6_PRI_NAT_SRC - 1, 2373 }, 2374 /* Before ip_vs_in, change source only for VS/NAT */ 2375 { 2376 .hook = ip_vs_out_hook, 2377 .pf = NFPROTO_IPV6, 2378 .hooknum = NF_INET_LOCAL_OUT, 2379 .priority = NF_IP6_PRI_NAT_DST + 1, 2380 }, 2381 /* After mangle, schedule and forward local requests */ 2382 { 2383 .hook = ip_vs_in_hook, 2384 .pf = NFPROTO_IPV6, 2385 .hooknum = NF_INET_LOCAL_OUT, 2386 .priority = NF_IP6_PRI_NAT_DST + 2, 2387 }, 2388 /* After packet filtering (but before ip_vs_out_icmp), catch icmp 2389 * destined for 0.0.0.0/0, which is for incoming IPVS connections */ 2390 { 2391 .hook = ip_vs_forward_icmp, 2392 .pf = NFPROTO_IPV6, 2393 .hooknum = NF_INET_FORWARD, 2394 .priority = 99, 2395 }, 2396 /* After packet filtering, change source only for VS/NAT */ 2397 { 2398 .hook = ip_vs_out_hook, 2399 .pf = NFPROTO_IPV6, 2400 .hooknum = NF_INET_FORWARD, 2401 .priority = 100, 2402 }, 2403 }; 2404 #endif 2405 2406 int ip_vs_register_hooks(struct netns_ipvs *ipvs, unsigned int af) 2407 { 2408 const struct nf_hook_ops *ops; 2409 unsigned int count; 2410 unsigned int afmask; 2411 int ret = 0; 2412 2413 if (af == AF_INET6) { 2414 #ifdef CONFIG_IP_VS_IPV6 2415 ops = ip_vs_ops6; 2416 count = ARRAY_SIZE(ip_vs_ops6); 2417 afmask = 2; 2418 #else 2419 return -EINVAL; 2420 #endif 2421 } else { 2422 ops = ip_vs_ops4; 2423 count = ARRAY_SIZE(ip_vs_ops4); 2424 afmask = 1; 2425 } 2426 2427 if (!(ipvs->hooks_afmask & afmask)) { 2428 ret = nf_register_net_hooks(ipvs->net, ops, count); 2429 if (ret >= 0) 2430 ipvs->hooks_afmask |= afmask; 2431 } 2432 return ret; 2433 } 2434 2435 void ip_vs_unregister_hooks(struct netns_ipvs *ipvs, unsigned int af) 2436 { 2437 const struct nf_hook_ops *ops; 2438 unsigned int count; 2439 unsigned int afmask; 2440 2441 if (af == AF_INET6) { 2442 #ifdef CONFIG_IP_VS_IPV6 2443 ops = ip_vs_ops6; 2444 count = ARRAY_SIZE(ip_vs_ops6); 2445 afmask = 2; 2446 #else 2447 return; 2448 #endif 2449 } else { 2450 ops = ip_vs_ops4; 2451 count = ARRAY_SIZE(ip_vs_ops4); 2452 afmask = 1; 2453 } 2454 2455 if (ipvs->hooks_afmask & afmask) { 2456 nf_unregister_net_hooks(ipvs->net, ops, count); 2457 ipvs->hooks_afmask &= ~afmask; 2458 } 2459 } 2460 2461 /* 2462 * Initialize IP Virtual Server netns mem. 2463 */ 2464 static int __net_init __ip_vs_init(struct net *net) 2465 { 2466 struct netns_ipvs *ipvs; 2467 2468 ipvs = net_generic(net, ip_vs_net_id); 2469 if (ipvs == NULL) 2470 return -ENOMEM; 2471 2472 /* Hold the beast until a service is registered */ 2473 WRITE_ONCE(ipvs->enable, 0); 2474 ipvs->net = net; 2475 /* Counters used for creating unique names */ 2476 ipvs->gen = atomic_read(&ipvs_netns_cnt); 2477 atomic_inc(&ipvs_netns_cnt); 2478 net->ipvs = ipvs; 2479 2480 if (ip_vs_estimator_net_init(ipvs) < 0) 2481 goto estimator_fail; 2482 2483 if (ip_vs_control_net_init(ipvs) < 0) 2484 goto control_fail; 2485 2486 if (ip_vs_protocol_net_init(ipvs) < 0) 2487 goto protocol_fail; 2488 2489 if (ip_vs_app_net_init(ipvs) < 0) 2490 goto app_fail; 2491 2492 if (ip_vs_conn_net_init(ipvs) < 0) 2493 goto conn_fail; 2494 2495 if (ip_vs_sync_net_init(ipvs) < 0) 2496 goto sync_fail; 2497 2498 return 0; 2499 /* 2500 * Error handling 2501 */ 2502 2503 sync_fail: 2504 ip_vs_conn_net_cleanup(ipvs); 2505 conn_fail: 2506 ip_vs_app_net_cleanup(ipvs); 2507 app_fail: 2508 ip_vs_protocol_net_cleanup(ipvs); 2509 protocol_fail: 2510 ip_vs_control_net_cleanup(ipvs); 2511 control_fail: 2512 ip_vs_estimator_net_cleanup(ipvs); 2513 estimator_fail: 2514 net->ipvs = NULL; 2515 return -ENOMEM; 2516 } 2517 2518 static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list) 2519 { 2520 struct netns_ipvs *ipvs; 2521 struct net *net; 2522 2523 ip_vs_service_nets_cleanup(net_list); /* ip_vs_flush() with locks */ 2524 list_for_each_entry(net, net_list, exit_list) { 2525 ipvs = net_ipvs(net); 2526 ip_vs_conn_net_cleanup(ipvs); 2527 ip_vs_app_net_cleanup(ipvs); 2528 ip_vs_protocol_net_cleanup(ipvs); 2529 ip_vs_control_net_cleanup(ipvs); 2530 ip_vs_estimator_net_cleanup(ipvs); 2531 IP_VS_DBG(2, "ipvs netns %d released\n", ipvs->gen); 2532 net->ipvs = NULL; 2533 } 2534 } 2535 2536 static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) 2537 { 2538 struct netns_ipvs *ipvs; 2539 struct net *net; 2540 2541 list_for_each_entry(net, net_list, exit_list) { 2542 ipvs = net_ipvs(net); 2543 ip_vs_unregister_hooks(ipvs, AF_INET); 2544 ip_vs_unregister_hooks(ipvs, AF_INET6); 2545 WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ 2546 smp_wmb(); 2547 ip_vs_sync_net_cleanup(ipvs); 2548 } 2549 } 2550 2551 static struct pernet_operations ipvs_core_ops = { 2552 .init = __ip_vs_init, 2553 .exit_batch = __ip_vs_cleanup_batch, 2554 .id = &ip_vs_net_id, 2555 .size = sizeof(struct netns_ipvs), 2556 }; 2557 2558 static struct pernet_operations ipvs_core_dev_ops = { 2559 .exit_batch = __ip_vs_dev_cleanup_batch, 2560 }; 2561 2562 /* 2563 * Initialize IP Virtual Server 2564 */ 2565 static int __init ip_vs_init(void) 2566 { 2567 int ret; 2568 2569 ret = ip_vs_control_init(); 2570 if (ret < 0) { 2571 pr_err("can't setup control.\n"); 2572 goto exit; 2573 } 2574 2575 ip_vs_protocol_init(); 2576 2577 ret = ip_vs_conn_init(); 2578 if (ret < 0) { 2579 pr_err("can't setup connection table.\n"); 2580 goto cleanup_protocol; 2581 } 2582 2583 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ 2584 if (ret < 0) 2585 goto cleanup_conn; 2586 2587 ret = register_pernet_device(&ipvs_core_dev_ops); 2588 if (ret < 0) 2589 goto cleanup_sub; 2590 2591 ret = ip_vs_register_nl_ioctl(); 2592 if (ret < 0) { 2593 pr_err("can't register netlink/ioctl.\n"); 2594 goto cleanup_dev; 2595 } 2596 2597 pr_info("ipvs loaded.\n"); 2598 2599 return ret; 2600 2601 cleanup_dev: 2602 unregister_pernet_device(&ipvs_core_dev_ops); 2603 cleanup_sub: 2604 unregister_pernet_subsys(&ipvs_core_ops); 2605 cleanup_conn: 2606 ip_vs_conn_cleanup(); 2607 cleanup_protocol: 2608 ip_vs_protocol_cleanup(); 2609 ip_vs_control_cleanup(); 2610 exit: 2611 return ret; 2612 } 2613 2614 static void __exit ip_vs_cleanup(void) 2615 { 2616 ip_vs_unregister_nl_ioctl(); 2617 unregister_pernet_device(&ipvs_core_dev_ops); 2618 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ 2619 ip_vs_conn_cleanup(); 2620 ip_vs_protocol_cleanup(); 2621 ip_vs_control_cleanup(); 2622 /* common rcu_barrier() used by: 2623 * - ip_vs_control_cleanup() 2624 */ 2625 rcu_barrier(); 2626 pr_info("ipvs unloaded.\n"); 2627 } 2628 2629 module_init(ip_vs_init); 2630 module_exit(ip_vs_cleanup); 2631 MODULE_LICENSE("GPL"); 2632 MODULE_DESCRIPTION("IP Virtual Server"); 2633