1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * ip_vs_xmit.c: various packet transmitters for IPVS 4 * 5 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 6 * Julian Anastasov <ja@ssi.bg> 7 * 8 * Changes: 9 * 10 * Description of forwarding methods: 11 * - all transmitters are called from LOCAL_IN (remote clients) and 12 * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD 13 * - not all connections have destination server, for example, 14 * connections in backup server when fwmark is used 15 * - bypass connections use daddr from packet 16 * - we can use dst without ref while sending in RCU section, we use 17 * ref when returning NF_ACCEPT for NAT-ed packet via loopback 18 * LOCAL_OUT rules: 19 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) 20 * - skb->pkt_type is not set yet 21 * - the only place where we can see skb->sk != NULL 22 */ 23 24 #define pr_fmt(fmt) "IPVS: " fmt 25 26 #include <linux/kernel.h> 27 #include <linux/slab.h> 28 #include <linux/tcp.h> /* for tcphdr */ 29 #include <net/ip.h> 30 #include <net/gue.h> 31 #include <net/gre.h> 32 #include <net/tcp.h> /* for csum_tcpudp_magic */ 33 #include <net/udp.h> 34 #include <net/icmp.h> /* for icmp_send */ 35 #include <net/route.h> /* for ip_route_output */ 36 #include <net/ipv6.h> 37 #include <net/ip6_route.h> 38 #include <net/ip_tunnels.h> 39 #include <net/ip6_checksum.h> 40 #include <net/addrconf.h> 41 #include <linux/icmpv6.h> 42 #include <linux/netfilter.h> 43 #include <linux/netfilter_ipv4.h> 44 45 #include <net/ip_vs.h> 46 47 enum { 48 IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ 49 IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ 50 IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to 51 * local 52 */ 53 IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ 54 IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ 55 IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */ 56 }; 57 58 static inline struct ip_vs_dest_dst 
*ip_vs_dest_dst_alloc(void)
{
	return kmalloc_obj(struct ip_vs_dest_dst, GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	/* Caller must hold dest->dst_lock (checked by lockdep below) */
	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	/* Old entry is released only after an RCU grace period */
	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

/* Return the cached dest_dst when its route is still valid, NULL
 * otherwise.  Runs under RCU read-side protection.
 */
static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (READ_ONCE(dst->obsolete) &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}

/* True when the (possibly defragmented) IPv6 packet exceeds @mtu */
static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tell us that, this packet have been
		 * defragmented by netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true; /* largest fragment violate MTU */
	}
	else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true; /* Packet size violate MTU size */
	}
	return false;
}

/* Get route to daddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *ret_saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	}
	if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		/* Bind the route to the discovered source address and
		 * look it up once more; rt_mode is cleared so we retry
		 * at most once.
		 */
		ip_rt_put(rt);
		flowi4_update_output(&fl4, 0, daddr, fl4.saddr);
		rt_mode = 0;
		goto retry;
	}
	if (ret_saddr)
		*ret_saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
/* True when the IPv6 route points to a loopback (local) device */
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

/* Decide whether switching skb to the new route would illegally cross
 * the local/non-local boundary for the given rt_mode flags.
 */
static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			dst_rt6_info(skb_dst(skb)));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}

/* Propagate the learned path MTU to a local full socket, if any */
static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk &&
	    sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}

/* Check the packet against @mtu; send ICMP FRAG_NEEDED / PKT_TOOBIG as
 * appropriate and return false when the packet cannot be forwarded
 * as-is.
 */
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

/* Decrement TTL / hop limit; on expiry send ICMP TIME_EXCEEDED and
 * return false so the caller drops the packet.
 */
static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}

/* rt has device that is down */
static bool rt_dev_is_down(const struct net_device *dev)
{
	return dev && !netif_running(dev);
}

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst)) {
			rt = dst_rtable(dest_dst->dst_cache);
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.ip;
		} else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			/* It is forbidden to attach dest->dest_dst if
			 * device is going down.
			 */
			if (!rt_dev_is_down(dst_dev_rcu(&rt->dst)))
				__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			else
				noref = 0;
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  rcuref_read(&rt->dst.__rcuref));
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.ip;
			if (!noref)
				ip_vs_dest_dst_free(dest_dst);
		}
	} else {
		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, ret_saddr);
		if (!rt)
			goto err_unreach;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		/* Tunnel mode: subtract encapsulation overhead from the
		 * path MTU.
		 */
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}

#ifdef CONFIG_IP_VS_IPV6
/* IPv6 route lookup for IPVS; optionally selects a source address and
 * performs an xfrm lookup.  Returns NULL on error.
 */
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 *	Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst)) {
			rt = dst_rt6_info(dest_dst->dst_cache);
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.in6;
		} else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = dst_rt6_info(dst);
			cookie = rt6_get_cookie(rt);
			/* It is forbidden to attach dest->dest_dst if
			 * device is going down.
			 */
			if (!rt_dev_is_down(dst_dev_rcu(&rt->dst)))
				__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			else
				noref = 0;
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  rcuref_read(&rt->dst.__rcuref));
			if (ret_saddr)
				*ret_saddr = dest_dst->dst_saddr.in6;
			if (!noref)
				ip_vs_dest_dst_free(dest_dst);
		}
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = dst_rt6_info(dst);
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		/* Tunnel mode: subtract encapsulation overhead */
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif


/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket).
   This causes havoc down the line (ip_local_out et. al. expect regular
 * sockets and invalid memory accesses will happen) so simply drop the
 * association in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb_clear_tstamp(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}


/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available, it may be only used in transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr *iph = ip_hdr(skb);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	/* check if it is a
	   connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;
		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address?
	 */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when application helper enlarges the packet and the length
	   is larger than the MTU of outgoing device, there will be still
	   MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	return rc;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb. This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	/* Reallocate headroom when needed; keep socket ownership so
	 * write-memory accounting is preserved.
	 */
	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			/* NOTE(review): mainline computes this as
			 * ntohs(old_ipv6h->payload_len); confirm that an
			 * ipv6_payload_len() helper exists in this tree.
			 */
			*payload_len =
				ipv6_payload_len(skb, old_ipv6h) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = skb_ip_totlen(skb);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

/* GSO tunnel type mask for the chosen outer address family */
static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}

/* Insert a GUE header (plus optional remote-checksum private option)
 * in front of the payload; on success *next_protocol becomes
 * IPPROTO_UDP.  Returns 0 on success or -EINVAL.
 */
static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr  *udph;	/* Our new UDP header */
	struct guehdr  *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;	/* GUE hlen is in 32-bit words */
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}

/* Prepend a GRE header; *next_protocol becomes IPPROTO_GRE */
static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, processe
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		return NF_STOLEN;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
skb_set_inner_mac_header(skb, skb_inner_network_offset(skb)); 1228 1229 if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { 1230 bool check = false; 1231 1232 if (ipvs_gue_encap(net, skb, cp, &next_protocol)) 1233 goto tx_error; 1234 1235 if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || 1236 (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) 1237 check = true; 1238 1239 udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len); 1240 } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) 1241 ipvs_gre_encap(net, skb, cp, &next_protocol); 1242 1243 skb_push(skb, sizeof(struct iphdr)); 1244 skb_reset_network_header(skb); 1245 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1246 1247 /* 1248 * Push down and install the IPIP header. 1249 */ 1250 iph = ip_hdr(skb); 1251 iph->version = 4; 1252 iph->ihl = sizeof(struct iphdr)>>2; 1253 iph->frag_off = df; 1254 iph->protocol = next_protocol; 1255 iph->tos = dsfield; 1256 iph->daddr = cp->daddr.ip; 1257 iph->saddr = saddr; 1258 iph->ttl = ttl; 1259 ip_select_ident(net, skb, NULL); 1260 1261 /* Another hack: avoid icmp_send in ip_fragment */ 1262 skb->ignore_df = 1; 1263 1264 ret = ip_vs_tunnel_xmit_prepare(skb, cp); 1265 if (ret == NF_ACCEPT) 1266 ip_local_out(net, skb->sk, skb); 1267 else if (ret == NF_DROP) 1268 kfree_skb(skb); 1269 1270 return NF_STOLEN; 1271 1272 tx_error: 1273 kfree_skb(skb); 1274 return NF_STOLEN; 1275 } 1276 1277 #ifdef CONFIG_IP_VS_IPV6 1278 int 1279 ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, 1280 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) 1281 { 1282 struct netns_ipvs *ipvs = cp->ipvs; 1283 struct net *net = ipvs->net; 1284 struct rt6_info *rt; /* Route to the other host */ 1285 struct in6_addr saddr; /* Source for tunnel */ 1286 struct net_device *tdev; /* Device to other host */ 1287 __u8 next_protocol = 0; 1288 __u32 payload_len = 0; 1289 __u8 dsfield = 0; 1290 __u8 ttl = 0; 1291 struct ipv6hdr *iph; /* Our new IP header */ 1292 unsigned int max_headroom; /* The extra 
header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	/* Route to the real server; tunnel mode, saddr returned for the
	 * outer header (4th arg "1" requests source address selection).
	 */
	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	/* Destination is a local address: deliver without encapsulation */
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = dst_rt6_info(skb_dst(skb));
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	/* Account for the optional GUE/GRE encapsulation headers */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		/* Remote checksum offload needs extra GUE option space */
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
		size_t gre_hdrlen;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* No df handling here: PMTU discovery sysctl applies to IPv4 only */
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	/* NOTE(review): on error the helper is assumed to have consumed the
	 * skb (no kfree_skb before NF_STOLEN) — confirm against the helper.
	 */
	if (IS_ERR(skb))
		return NF_STOLEN;

	/* Pick GSO bits matching the chosen encapsulation */
	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);
	skb_set_inner_mac_header(skb, skb_inner_network_offset(skb));

	/* Build the chosen encapsulation and, for GUE, the outer UDP csum */
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	/* NOTE(review): IPCB is used here as in the IPv4 path to zero the
	 * options area of the skb control block — confirm intended for IPv6.
	 */
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	/* skb is always consumed from here on, hence NF_STOLEN */
	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	/* KNOWN_NH: route via the real server's address as next hop; the
	 * packet itself is not rewritten in DR mode.
	 */
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	/* Destination is a local address: deliver locally */
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	/* Refresh the IPv4 header checksum before forwarding unchanged */
	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
/* IPv6 variant of ip_vs_dr_xmit(); no header checksum exists in IPv6 */
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	/* Destination is a local address: deliver locally */
	if (local)
		return
ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif


/*
 *	ICMP packet transmitter
 *	called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	/* Remember direction before rerouting: needed for the loopback
	 * DNAT check below.
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	/* Ensure headroom for the output device's link-layer header */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* Translate the embedded addresses/ports inside the ICMP payload */
	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
/* IPv6 variant of ip_vs_icmp_xmit(), called for ICMPv6 related packets */
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if
(IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		return rc;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = dst_rt6_info(skb_dst(skb));
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	/* Ensure headroom for the output device's link-layer header */
	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* Translate the embedded addresses/ports inside the ICMPv6 payload */
	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
	return rc;
}
#endif