1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2002 Michael Shalayeff.
5 * Copyright (c) 2003 Ryan McBride.
6 * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
22 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
28 * THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "opt_bpf.h"
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/devctl.h>
38 #include <sys/jail.h>
39 #include <sys/kassert.h>
40 #include <sys/kernel.h>
41 #include <sys/limits.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/module.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/socket.h>
48 #include <sys/sockio.h>
49 #include <sys/sysctl.h>
50 #include <sys/syslog.h>
51 #include <sys/taskqueue.h>
52 #include <sys/counter.h>
53
54 #include <net/ethernet.h>
55 #include <net/if.h>
56 #include <net/if_var.h>
57 #include <net/if_dl.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_private.h>
60 #include <net/if_types.h>
61 #include <net/route.h>
62 #include <net/vnet.h>
63
64 #if defined(INET) || defined(INET6)
65 #include <netinet/in.h>
66 #include <netinet/in_var.h>
67 #include <netinet/ip_carp.h>
68 #include <netinet/ip_carp_nl.h>
69 #include <netinet/ip.h>
70 #include <machine/in_cksum.h>
71 #endif
72 #ifdef INET
73 #include <netinet/ip_var.h>
74 #include <netinet/if_ether.h>
75 #endif
76
77 #ifdef INET6
78 #include <netinet/icmp6.h>
79 #include <netinet/ip6.h>
80 #include <netinet6/in6_var.h>
81 #include <netinet6/ip6_var.h>
82 #include <netinet6/scope6_var.h>
83 #include <netinet6/nd6.h>
84 #endif
85
86 #include <netlink/netlink.h>
87 #include <netlink/netlink_ctl.h>
88 #include <netlink/netlink_generic.h>
89 #include <netlink/netlink_message_parser.h>
90
91 #include <crypto/sha1.h>
92
93 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
94
/*
 * Per-vhid state.  One carp_softc exists for each virtual host id (vhid)
 * configured on a parent interface; it is linked both on the parent's
 * carp_if list (sc_list) and on the global carp_list (sc_next).
 *
 * The anonymous union holds the protocol specific part: which member is
 * valid is selected by sc_version.
 */
struct carp_softc {
	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
	carp_version_t		sc_version;	/* carp or VRRPv3 */
	uint8_t			sc_addr[ETHER_ADDR_LEN];	/* Our link level address. */
	struct callout		sc_ad_tmo;	/* Advertising timeout. */
#ifdef INET
	struct callout		sc_md_tmo;	/* Master down timeout. */
#endif
#ifdef INET6
	struct callout		sc_md6_tmo;	/* XXX: Master down timeout. */
#endif
	/* Softc lock, see CARP_LOCK(); serialises input, callouts, ioctls. */
	struct mtx		sc_mtx;

	/* Virtual host id; only the low 8 bits go into the HMAC. */
	int			sc_vhid;
	union {
		struct { /* sc_version == CARP_VERSION_CARP */
			/* Skew in 1/256 s units (see carp_input_c()). */
			int		sc_advskew;
			/* Advertisement base interval, in seconds. */
			int		sc_advbase;
			/*
			 * Peer addresses.  carp_input_c() only enforces
			 * TTL 255 when the relevant one is multicast.
			 */
			struct in_addr	sc_carpaddr;
			struct in6_addr	sc_carpaddr6;
			/* 64-bit counter carried in advertisements. */
			uint64_t	sc_counter;
			/* If set, carp_prepare_ad() picks a random counter. */
			bool		sc_init_counter;
#define	CARP_HMAC_PAD	64
			/* HMAC key. */
			unsigned char	sc_key[CARP_KEY_LEN];
			/* HMAC pad: ipad while hashing, opad afterwards. */
			unsigned char	sc_pad[CARP_HMAC_PAD];
			/* Precomputed inner hash (carp_hmac_prepare()). */
			SHA1_CTX	sc_sha1;
		};
		struct { /* sc_version == CARP_VERSION_VRRPv3 */
			/* Our priority; 255 is never demoted. */
			uint8_t		sc_vrrp_prio;
			/* Our advertisement interval. */
			uint16_t	sc_vrrp_adv_inter;
			/* Interval learned from the last accepted peer ad. */
			uint16_t	sc_vrrp_master_inter;
		};
	};
	/*
	 * sc_naddrs + sc_naddrs6 bounds the walk over sc_ifas in
	 * CARP_FOREACH_IFA(); presumably the IPv4 and IPv6 address counts.
	 */
	int			sc_naddrs;
	int			sc_naddrs6;
	/* NOTE(review): presumably the allocated size of sc_ifas - confirm. */
	int			sc_ifasiz;
	enum { INIT = 0, BACKUP, MASTER }	sc_state;
	int			sc_suppress;
	int			sc_sendad_errors;
#define	CARP_SENDAD_MAX_ERRORS	3
	int			sc_sendad_success;
#define	CARP_SENDAD_MIN_SUCCESS 3

	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
};
142
/*
 * Per-interface CARP state, hung off ifp->if_carp of a carp-capable ifnet.
 * It anchors the list of softcs (one per vhid) configured on that interface.
 */
struct carp_if {
#ifdef INET
	int	cif_naddrs;
#endif
#ifdef INET6
	int	cif_naddrs6;
#endif
	/* Softcs on this interface, linked through carp_softc.sc_list. */
	TAILQ_HEAD(, carp_softc) cif_vrs;
#ifdef INET
	struct ip_moptions 	 cif_imo;
#endif
#ifdef INET6
	struct ip6_moptions 	 cif_im6o;
#endif
	/* Interface this state belongs to. */
	struct ifnet	*cif_ifp;
	/* Protects cif_vrs, see CIF_LOCK(). */
	struct mtx	cif_mtx;
	uint32_t	cif_flags;
#define	CIF_PROMISC	0x00000001
};
162
163 /*
164 * Brief design of carp(4).
165 *
166 * Any carp-capable ifnet may have a list of carp softcs hanging off
167 * its ifp->if_carp pointer. Each softc represents one unique virtual
168 * host id, or vhid. The softc has a back pointer to the ifnet. All
169 * softcs are joined in a global list, which has quite limited use.
170 *
171 * Any interface address that takes part in CARP negotiation has a
172 * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
173 * AF_INET or AF_INET6 address.
174 *
 * Although one can follow the softc's back pointer to the ifnet and
 * traverse its ifp->if_addrhead queue to find all interface addresses
 * involved in CARP, we keep a growable array of ifaddr pointers.  This
 * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
 * call into the network stack, thus avoiding lock order reversals (LORs).
180 *
181 * Locking:
182 *
183 * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
184 * callout-driven events and ioctl()s.
185 *
186 * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx.
187 * To traverse the global list we use the mutex carp_mtx.
188 *
189 * Known issues with locking:
190 *
191 * - On module unload we may race (?) with packet processing thread
192 * dereferencing our function pointers.
193 */
194
/*
 * Per-VNET tunables.  Each one is reached through its V_ accessor and is
 * exported below under the net.inet.carp sysctl node.
 */

/* Accept incoming CARP packets. */
VNET_DEFINE_STATIC(int, carp_allow) = 1;
#define	V_carp_allow	VNET(carp_allow)

/* Set DSCP in outgoing CARP packets. */
VNET_DEFINE_STATIC(int, carp_dscp) = 56;
#define	V_carp_dscp	VNET(carp_dscp)

/* Preempt slower nodes. */
VNET_DEFINE_STATIC(int, carp_preempt) = 0;
#define	V_carp_preempt	VNET(carp_preempt)

/* Log level: > 0 enables CARP_LOG(), > 1 also enables CARP_DEBUG(). */
VNET_DEFINE_STATIC(int, carp_log) = 1;
#define	V_carp_log	VNET(carp_log)

/*
 * Global advskew demotion.  Added to advskew by DEMOTE_ADVSKEW() and
 * subtracted from the VRRPv3 priority by DEMOTE_VRRP_PRIO().
 */
VNET_DEFINE_STATIC(int, carp_demotion) = 0;
#define	V_carp_demotion	VNET(carp_demotion)

/* Send error demotion factor. */
VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW;
#define	V_carp_senderr_adj	VNET(carp_senderr_adj)

/* Iface down demotion factor. */
VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW;
#define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)

/* Handlers for the sysctls that are not plain integers. */
static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS);
static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS);
static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "CARP");
/* allow, dscp and demotion go through handler functions, not SYSCTL_INT. */
SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
    &VNET_NAME(carp_allow), 0, carp_allow_sysctl, "I",
    "Accept incoming CARP packets");
SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    0, 0, carp_dscp_sysctl, "I",
    "DSCP value for carp packets");
SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_log), 0, "CARP log level");
SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    0, 0, carp_demote_adj_sysctl, "I",
    "Adjust demotion factor (skew of advskew)");
SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_ifdown_adj), 0,
    "Interface down demotion factor adjustment");
252
/* Per-VNET, per-CPU CARP statistics, exported as net.inet.carp.stats. */
VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
VNET_PCPUSTAT_SYSINIT(carpstats);
VNET_PCPUSTAT_SYSUNINIT(carpstats);

/*
 * Add 'val' to the counter backing field 'name' of struct carpstats.  The
 * counter is selected by the field's offset divided by sizeof(uint64_t),
 * which assumes every field of struct carpstats is a uint64_t - TODO confirm
 * against netinet/ip_carp.h.
 */
#define	CARPSTATS_ADD(name, val)	\
    counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
	sizeof(uint64_t)], (val))
/* Bump the counter backing field 'name' by one. */
#define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)

SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
    carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
264
/* Softc lock (sc_mtx) helpers. */
#define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
	NULL, MTX_DEF)
#define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
#define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
#define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
/* Per-interface lock (cif_mtx) helpers. */
#define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
	NULL, MTX_DEF)
#define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
#define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
#define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
#define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
/*
 * Release the carp_if if no softc is left on it.  The lock is taken first;
 * when the softc list is empty the still-locked cif is handed to
 * carp_free_if() (which is therefore expected to deal with the lock itself -
 * confirm against carp_free_if()), otherwise the lock is simply dropped.
 */
#define	CIF_FREE(cif)	do {				\
		CIF_LOCK(cif);				\
		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
			carp_free_if(cif);		\
		else					\
			CIF_UNLOCK(cif);		\
} while (0)
284
/*
 * Log at LOG_INFO when the log level (V_carp_log) is above 0.  The
 * "carp: " prefix is pasted onto the format, so the first argument must be
 * a string literal.
 */
#define	CARP_LOG(...)	do {				\
	if (V_carp_log > 0)				\
		log(LOG_INFO, "carp: " __VA_ARGS__);	\
} while (0)

/* Log at LOG_DEBUG when the log level (V_carp_log) is above 1. */
#define	CARP_DEBUG(...)	do {				\
	if (V_carp_log > 1)				\
		log(LOG_DEBUG, __VA_ARGS__);		\
} while (0)
294
/*
 * Iteration helpers.  Each macro is a loop header to be followed by the loop
 * body.  Every macro argument is parenthesized on use so that arbitrary
 * expressions can be passed.
 *
 * Caveats:
 *  - IFNET_FOREACH_IFA() ends in an unbraced 'if', so an 'else' written
 *    right after the loop body would bind to that hidden 'if'.
 *  - CARP_FOREACH_IFA() and IFNET_FOREACH_CARP() expand to an assertion
 *    statement followed by the loop, so they must not be used as the sole
 *    statement of an unbraced if/else/loop.
 */

/* Visit every address of 'ifp' that has a softc attached (ifa_carp set). */
#define	IFNET_FOREACH_IFA(ifp, ifa)					\
	CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)	\
		if ((ifa)->ifa_carp != NULL)

/*
 * Visit the addresses stored in sc->sc_ifas.  Requires the softc lock.
 * The walk stops after sc_naddrs + sc_naddrs6 entries or at the first NULL
 * slot, whichever comes first.
 */
#define	CARP_FOREACH_IFA(sc, ifa)					\
	CARP_LOCK_ASSERT(sc);						\
	for (int _i = 0;						\
		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
		((ifa) = (sc)->sc_ifas[_i]) != NULL;			\
		++_i)

/*
 * Visit every softc configured on 'ifp'.  Asserts that either the
 * interface's cif_mtx is owned or carp_sx is exclusively locked.
 */
#define	IFNET_FOREACH_CARP(ifp, sc)					\
	KASSERT(mtx_owned(&(ifp)->if_carp->cif_mtx) ||			\
	    sx_xlocked(&carp_sx), ("cif_vrs not locked"));		\
	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
310
/*
 * Effective CARP advskew: the configured skew plus the global demotion
 * counter, clamped to [0, CARP_MAXSKEW].  The argument is evaluated more
 * than once.
 */
#define	DEMOTE_ADVSKEW(sc)						\
	(((sc)->sc_advskew + V_carp_demotion < 0) ? 0 :		\
	    (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
	    CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)))

/*
 * Effective VRRPv3 priority.  Priority works the opposite way from advskew
 * (higher wins), so the global demotion counter is subtracted and the
 * result clamped to [0, 254].  A configured priority of 255 (IP address
 * owner) is returned unchanged.  The argument is evaluated more than once.
 */
#define	DEMOTE_VRRP_PRIO(sc)						\
	((sc)->sc_vrrp_prio == 255 ? 255 :				\
	    (((int)(sc)->sc_vrrp_prio - V_carp_demotion > 254) ? 254 :	\
	    (((int)(sc)->sc_vrrp_prio - V_carp_demotion < 0) ? 0 :	\
	    (int)(sc)->sc_vrrp_prio - V_carp_demotion)))
327
/* Forward declarations for functions used before their definition. */
static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t, int);
static void	vrrp_input_c(struct mbuf *, int, sa_family_t, int, int, uint16_t);
static struct carp_softc
		*carp_alloc(struct ifnet *, carp_version_t, int);
static void	carp_destroy(struct carp_softc *);
static struct carp_if
		*carp_alloc_if(struct ifnet *);
static void	carp_free_if(struct carp_if *);
static void	carp_set_state(struct carp_softc *, int, const char* reason);
static void	carp_sc_state(struct carp_softc *);
static void	carp_setrun(struct carp_softc *, sa_family_t);
static void	carp_master_down(void *);
static void	carp_master_down_locked(struct carp_softc *,
    		    const char* reason);
static void	carp_send_ad_locked(struct carp_softc *);
static void	vrrp_send_ad_locked(struct carp_softc *);
static void	carp_addroute(struct carp_softc *);
static void	carp_ifa_addroute(struct ifaddr *);
static void	carp_delroute(struct carp_softc *);
static void	carp_ifa_delroute(struct ifaddr *);
static void	carp_send_ad_all(void *, int);
static void	carp_demote_adj(int, char *);

/* Global list of all softcs (linked through sc_next); walk under carp_mtx. */
static LIST_HEAD(, carp_softc) carp_list = LIST_HEAD_INITIALIZER(carp_list);
/* Protects traversal of carp_list. */
static struct mtx carp_mtx;
/* Alternative to CIF_LOCK() for walking an interface's softc list. */
static struct sx carp_sx;
/* Task that runs carp_send_ad_all() from a taskqueue. */
static struct task carp_sendall_task =
    TASK_INITIALIZER(0, carp_send_ad_all, NULL);
356
357 static int
carp_is_supported_if(if_t ifp)358 carp_is_supported_if(if_t ifp)
359 {
360 if (ifp == NULL)
361 return (ENXIO);
362
363 switch (ifp->if_type) {
364 case IFT_ETHER:
365 case IFT_L2VLAN:
366 case IFT_BRIDGE:
367 break;
368 default:
369 return (EOPNOTSUPP);
370 }
371
372 return (0);
373 }
374
375 static void
carp_hmac_prepare(struct carp_softc * sc)376 carp_hmac_prepare(struct carp_softc *sc)
377 {
378 uint8_t version = CARP_VERSION_CARP, type = CARP_ADVERTISEMENT;
379 uint8_t vhid = sc->sc_vhid & 0xff;
380 struct ifaddr *ifa;
381 int i, found;
382 #ifdef INET
383 struct in_addr last, cur, in;
384 #endif
385 #ifdef INET6
386 struct in6_addr last6, cur6, in6;
387 #endif
388
389 CARP_LOCK_ASSERT(sc);
390 MPASS(sc->sc_version == CARP_VERSION_CARP);
391
392 /* Compute ipad from key. */
393 bzero(sc->sc_pad, sizeof(sc->sc_pad));
394 bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
395 for (i = 0; i < sizeof(sc->sc_pad); i++)
396 sc->sc_pad[i] ^= 0x36;
397
398 /* Precompute first part of inner hash. */
399 SHA1Init(&sc->sc_sha1);
400 SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
401 SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
402 SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
403 SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
404 #ifdef INET
405 cur.s_addr = 0;
406 do {
407 found = 0;
408 last = cur;
409 cur.s_addr = 0xffffffff;
410 CARP_FOREACH_IFA(sc, ifa) {
411 in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
412 if (ifa->ifa_addr->sa_family == AF_INET &&
413 ntohl(in.s_addr) > ntohl(last.s_addr) &&
414 ntohl(in.s_addr) < ntohl(cur.s_addr)) {
415 cur.s_addr = in.s_addr;
416 found++;
417 }
418 }
419 if (found)
420 SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
421 } while (found);
422 #endif /* INET */
423 #ifdef INET6
424 memset(&cur6, 0, sizeof(cur6));
425 do {
426 found = 0;
427 last6 = cur6;
428 memset(&cur6, 0xff, sizeof(cur6));
429 CARP_FOREACH_IFA(sc, ifa) {
430 in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
431 if (IN6_IS_SCOPE_EMBED(&in6))
432 in6.s6_addr16[1] = 0;
433 if (ifa->ifa_addr->sa_family == AF_INET6 &&
434 memcmp(&in6, &last6, sizeof(in6)) > 0 &&
435 memcmp(&in6, &cur6, sizeof(in6)) < 0) {
436 cur6 = in6;
437 found++;
438 }
439 }
440 if (found)
441 SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
442 } while (found);
443 #endif /* INET6 */
444
445 /* convert ipad to opad */
446 for (i = 0; i < sizeof(sc->sc_pad); i++)
447 sc->sc_pad[i] ^= 0x36 ^ 0x5c;
448 }
449
450 static void
carp_hmac_generate(struct carp_softc * sc,uint32_t counter[2],unsigned char md[20])451 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
452 unsigned char md[20])
453 {
454 SHA1_CTX sha1ctx;
455
456 CARP_LOCK_ASSERT(sc);
457
458 /* fetch first half of inner hash */
459 bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
460
461 SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
462 SHA1Final(md, &sha1ctx);
463
464 /* outer hash */
465 SHA1Init(&sha1ctx);
466 SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
467 SHA1Update(&sha1ctx, md, 20);
468 SHA1Final(md, &sha1ctx);
469 }
470
/*
 * Check the HMAC carried in a received advertisement.
 *
 * sc:      locked softc the advertisement was matched to.
 * counter: counter words from the packet.
 * md:      20-byte digest from the packet.
 *
 * Returns 0 when the digest matches the one computed locally and non-zero
 * otherwise.
 *
 * The digest comes from the network, so it is compared with
 * timingsafe_bcmp(): the comparison time must not reveal how many leading
 * bytes of a forged digest were correct.
 */
static int
carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
    unsigned char md[20])
{
	unsigned char md2[20];

	CARP_LOCK_ASSERT(sc);

	carp_hmac_generate(sc, counter, md2);

	return (timingsafe_bcmp(md, md2, sizeof(md2)));
}
483
/*
 * Verify the checksum of a VRRPv3 advertisement.
 *
 * m:         packet.
 * off:       offset of the VRRP header within the packet.
 * len:       length of the VRRP message starting at 'off'.
 * phdrcksum: pseudo-header checksum computed by the caller.
 *
 * Returns 0 when the checksum is good, non-zero otherwise.
 *
 * Unlike CARP, which checksums the message alone, VRRPv3 also covers the
 * pseudo-header, so the caller's pseudo-header sum is subtracted from the
 * sum computed over the message.
 */
static int
vrrp_checksum_verify(struct mbuf *m, int off, int len, uint16_t phdrcksum)
{
	const uint16_t msgsum = in_cksum_skip(m, off + len, off);

	/* The difference is truncated to 16 bits, as a uint16_t -= would. */
	return ((uint16_t)(msgsum - phdrcksum));
}
499
500 /*
501 * process input packet.
502 * we have rearranged checks order compared to the rfc,
503 * but it seems more efficient this way or not possible otherwise.
504 */
#ifdef INET
/*
 * IPv4 input handler for CARP and VRRPv3 advertisements.
 *
 * mp:    in/out mbuf chain; the chain is always consumed and *mp is set to
 *        NULL.
 * offp:  offset of the CARP/VRRP header in the packet (the IPv4 header
 *        length).
 * proto: not used.
 *
 * Both protocols share the position of the version field, so just enough
 * of the header is pulled up to read it, then the length is re-validated
 * against the version specific header size before the packet is handed to
 * carp_input_c() or vrrp_input_c().
 *
 * Always returns IPPROTO_DONE.
 */
static int
carp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip *ip;
	struct vrrpv3_header *vh;
	int iplen;
	int minlen;
	int totlen;

	iplen = *offp;
	*mp = NULL;

	CARPSTATS_INC(carps_ipackets);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* Ensure we have enough header to figure out the version. */
	if (m->m_pkthdr.len < iplen + sizeof(*vh)) {
		CARPSTATS_INC(carps_badlen);
		/* Report the length that was actually tested. */
		CARP_DEBUG("%s: received len %d < sizeof(struct vrrpv3_header) "
		    "on %s\n", __func__, m->m_pkthdr.len - iplen,
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < iplen + sizeof(*vh)) {
		/* On failure m is not freed here; m_pullup() owns it. */
		if ((m = m_pullup(m, iplen + sizeof(*vh))) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__);
			return (IPPROTO_DONE);
		}
	}
	ip = mtod(m, struct ip *);
	totlen = ntohs(ip->ip_len);
	vh = (struct vrrpv3_header *)((char *)ip + iplen);

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP:
		minlen = sizeof(struct carp_header);
		break;
	case CARP_VERSION_VRRPv3:
		minlen = sizeof(struct vrrpv3_header);
		break;
	default:
		CARPSTATS_INC(carps_badver);
		CARP_DEBUG("%s: unsupported version %d on %s\n", __func__,
		    vh->vrrp_version, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* And now check the length again but with the real minimal length. */
	if (m->m_pkthdr.len < iplen + minlen) {
		CARPSTATS_INC(carps_badlen);
		/* Both numbers exclude the IP header. */
		CARP_DEBUG("%s: received len %d < %d "
		    "on %s\n", __func__, m->m_pkthdr.len - iplen,
		    minlen,
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < iplen + minlen) {
		if ((m = m_pullup(m, iplen + minlen)) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__);
			return (IPPROTO_DONE);
		}
		/* The data may have moved; refresh the header pointers. */
		ip = mtod(m, struct ip *);
		vh = (struct vrrpv3_header *)((char *)ip + iplen);
	}

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP: {
		struct carp_header *ch;

		/* verify the CARP checksum */
		if (in_cksum_skip(m, totlen, iplen)) {
			CARPSTATS_INC(carps_badsum);
			CARP_DEBUG("%s: checksum failed on %s\n", __func__,
			    if_name(m->m_pkthdr.rcvif));
			m_freem(m);
			break;
		}
		ch = (struct carp_header *)((char *)ip + iplen);
		carp_input_c(m, ch, AF_INET, ip->ip_ttl);
		break;
	}
	case CARP_VERSION_VRRPv3: {
		uint16_t phdrcksum;

		/* The VRRPv3 checksum is verified in vrrp_input_c(). */
		phdrcksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htonl((u_short)(totlen - iplen) + ip->ip_p));
		vrrp_input_c(m, iplen, AF_INET, ip->ip_ttl, totlen - iplen,
		    phdrcksum);
		break;
	}
	default:
		/* Unknown versions were rejected by the first switch. */
		KASSERT(false, ("Unsupported version %d", vh->vrrp_version));
	}

	return (IPPROTO_DONE);
}
#endif
615
#ifdef INET6
/*
 * IPv6 input handler for CARP and VRRPv3 advertisements.
 *
 * mp:    in/out mbuf chain; the chain is always consumed and *mp is set to
 *        NULL, as carp_input() does.
 * offp:  offset of the CARP/VRRP header in the packet.
 * proto: protocol number the handler was dispatched for; used as the
 *        upper-layer protocol of the VRRPv3 pseudo-header.
 *
 * The header is located through *offp everywhere: length validation,
 * pullup, header pointer and the offset handed to vrrp_input_c().  For a
 * packet without extension headers *offp equals sizeof(struct ip6_hdr), so
 * nothing changes there; with extension headers the header that is parsed
 * is the one whose version byte was inspected.
 *
 * Always returns IPPROTO_DONE.
 */
static int
carp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6;
	struct vrrpv3_header *vh;
	u_int len, minlen;
	int off;

	off = *offp;
	*mp = NULL;

	CARPSTATS_INC(carps_ipackets6);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* check if received on a valid carp interface */
	if (m->m_pkthdr.rcvif->if_carp == NULL) {
		CARPSTATS_INC(carps_badif);
		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
		    __func__, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* Get enough of the header to figure out the version. */
	if (m->m_len < off + sizeof(*vh)) {
		len = m->m_len;
		/* On failure m is not freed here; m_pullup() owns it. */
		m = m_pullup(m, off + sizeof(*vh));
		if (m == NULL) {
			CARPSTATS_INC(carps_badlen);
			CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
			return (IPPROTO_DONE);
		}
	}
	vh = (struct vrrpv3_header *)(mtod(m, char *) + off);

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP:
		minlen = sizeof(struct carp_header);
		break;
	case CARP_VERSION_VRRPv3:
		minlen = sizeof(struct vrrpv3_header);
		break;
	default:
		CARPSTATS_INC(carps_badver);
		CARP_DEBUG("%s: unsupported version %d on %s\n", __func__,
		    vh->vrrp_version, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* And now check the length again but with the real minimal length. */
	if (m->m_pkthdr.len < off + minlen) {
		CARPSTATS_INC(carps_badlen);
		CARP_DEBUG("%s: received len %d < %u "
		    "on %s\n", __func__, m->m_pkthdr.len,
		    off + minlen,
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < off + minlen) {
		if ((m = m_pullup(m, off + minlen)) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__, __LINE__);
			return (IPPROTO_DONE);
		}
		/* The data may have moved; refresh the header pointer. */
		vh = (struct vrrpv3_header *)(mtod(m, char *) + off);
	}
	ip6 = mtod(m, struct ip6_hdr *);

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP: {
		struct carp_header *ch;

		/* verify the CARP checksum */
		if (in_cksum_skip(m, off + sizeof(struct carp_header),
		    off)) {
			CARPSTATS_INC(carps_badsum);
			CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
			    if_name(m->m_pkthdr.rcvif));
			m_freem(m);
			break;
		}
		ch = (struct carp_header *)vh;
		carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim);
		break;
	}
	case CARP_VERSION_VRRPv3: {
		uint16_t phdrcksum;
		u_int exthlen, plen;

		/*
		 * ip6_plen counts everything after the fixed header, so
		 * take out the extension headers (none in the usual case)
		 * to get the length of the VRRP message.
		 */
		exthlen = off - sizeof(*ip6);
		plen = ntohs(ip6->ip6_plen);
		if (plen < exthlen) {
			CARPSTATS_INC(carps_badlen);
			CARP_DEBUG("%s: payload length %u too small on %s\n",
			    __func__, plen, if_name(m->m_pkthdr.rcvif));
			m_freem(m);
			break;
		}
		plen -= exthlen;

		/* The VRRPv3 checksum is verified in vrrp_input_c(). */
		phdrcksum = in6_cksum_pseudo(ip6, plen, proto, 0);
		vrrp_input_c(m, off, AF_INET6, ip6->ip6_hlim, plen,
		    phdrcksum);
		break;
	}
	default:
		/* Unknown versions were rejected by the first switch. */
		KASSERT(false, ("Unsupported version %d", vh->vrrp_version));
	}
	return (IPPROTO_DONE);
}
#endif /* INET6 */
721
722 /*
723 * This routine should not be necessary at all, but some switches
724 * (VMWare ESX vswitches) can echo our own packets back at us,
725 * and we must ignore them or they will cause us to drop out of
726 * MASTER mode.
727 *
728 * We cannot catch all cases of network loops. Instead, what we
729 * do here is catch any packet that arrives with a carp header
730 * with a VHID of 0, that comes from an address that is our own.
731 * These packets are by definition "from us" (even if they are from
732 * a misconfigured host that is pretending to be us).
733 *
734 * The VHID test is outside this mini-function.
735 */
736 static int
carp_source_is_self(const struct mbuf * m,struct ifaddr * ifa,sa_family_t af)737 carp_source_is_self(const struct mbuf *m, struct ifaddr *ifa, sa_family_t af)
738 {
739 #ifdef INET
740 struct ip *ip4;
741 struct in_addr in4;
742 #endif
743 #ifdef INET6
744 struct ip6_hdr *ip6;
745 struct in6_addr in6;
746 #endif
747
748 switch (af) {
749 #ifdef INET
750 case AF_INET:
751 ip4 = mtod(m, struct ip *);
752 in4 = ifatoia(ifa)->ia_addr.sin_addr;
753 return (in4.s_addr == ip4->ip_src.s_addr);
754 #endif
755 #ifdef INET6
756 case AF_INET6:
757 ip6 = mtod(m, struct ip6_hdr *);
758 in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
759 return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0);
760 #endif
761 default:
762 break;
763 }
764 return (0);
765 }
766
767 static struct ifaddr *
carp_find_ifa(const struct mbuf * m,sa_family_t af,uint8_t vhid)768 carp_find_ifa(const struct mbuf *m, sa_family_t af, uint8_t vhid)
769 {
770 struct ifnet *ifp = m->m_pkthdr.rcvif;
771 struct ifaddr *ifa, *match;
772 int error;
773
774 NET_EPOCH_ASSERT();
775
776 /*
777 * Verify that the VHID is valid on the receiving interface.
778 *
779 * There should be just one match. If there are none
780 * the VHID is not valid and we drop the packet. If
781 * there are multiple VHID matches, take just the first
782 * one, for compatibility with previous code. While we're
783 * scanning, check for obvious loops in the network topology
784 * (these should never happen, and as noted above, we may
785 * miss real loops; this is just a double-check).
786 */
787 error = 0;
788 match = NULL;
789 IFNET_FOREACH_IFA(ifp, ifa) {
790 if (match == NULL && ifa->ifa_carp != NULL &&
791 ifa->ifa_addr->sa_family == af &&
792 ifa->ifa_carp->sc_vhid == vhid)
793 match = ifa;
794 if (vhid == 0 && carp_source_is_self(m, ifa, af))
795 error = ELOOP;
796 }
797 ifa = error ? NULL : match;
798 if (ifa != NULL)
799 ifa_ref(ifa);
800
801 if (ifa == NULL) {
802 if (error == ELOOP) {
803 CARP_DEBUG("dropping looped packet on interface %s\n",
804 if_name(ifp));
805 CARPSTATS_INC(carps_badif); /* ??? */
806 } else {
807 CARPSTATS_INC(carps_badvhid);
808 }
809 }
810
811 return (ifa);
812 }
813
814 static void
carp_input_c(struct mbuf * m,struct carp_header * ch,sa_family_t af,int ttl)815 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl)
816 {
817 struct ifnet *ifp = m->m_pkthdr.rcvif;
818 struct ifaddr *ifa;
819 struct carp_softc *sc;
820 uint64_t tmp_counter;
821 struct timeval sc_tv, ch_tv;
822 bool multicast = false;
823
824 NET_EPOCH_ASSERT();
825 MPASS(ch->carp_version == CARP_VERSION_CARP);
826
827 ifa = carp_find_ifa(m, af, ch->carp_vhid);
828 if (ifa == NULL) {
829 m_freem(m);
830 return;
831 }
832
833 sc = ifa->ifa_carp;
834 CARP_LOCK(sc);
835
836 /* verify the CARP version. */
837 if (sc->sc_version != CARP_VERSION_CARP) {
838 CARP_UNLOCK(sc);
839
840 CARPSTATS_INC(carps_badver);
841 CARP_DEBUG("%s: invalid version %d\n", if_name(ifp),
842 ch->carp_version);
843 ifa_free(ifa);
844 m_freem(m);
845 return;
846 }
847
848 if (ifa->ifa_addr->sa_family == AF_INET) {
849 multicast = IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr));
850 } else {
851 multicast = IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6);
852 }
853 ifa_free(ifa);
854
855 /* verify that the IP TTL is 255, but only if we're not in unicast mode. */
856 if (multicast && ttl != CARP_DFLTTL) {
857 CARPSTATS_INC(carps_badttl);
858 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
859 ttl, if_name(m->m_pkthdr.rcvif));
860 goto out;
861 }
862
863 if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
864 CARPSTATS_INC(carps_badauth);
865 CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
866 sc->sc_vhid, if_name(ifp));
867 goto out;
868 }
869
870 tmp_counter = ntohl(ch->carp_counter[0]);
871 tmp_counter = tmp_counter<<32;
872 tmp_counter += ntohl(ch->carp_counter[1]);
873
874 /* XXX Replay protection goes here */
875
876 sc->sc_init_counter = false;
877 sc->sc_counter = tmp_counter;
878
879 sc_tv.tv_sec = sc->sc_advbase;
880 sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
881 ch_tv.tv_sec = ch->carp_advbase;
882 ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
883
884 switch (sc->sc_state) {
885 case INIT:
886 break;
887 case MASTER:
888 /*
889 * If we receive an advertisement from a master who's going to
890 * be more frequent than us, go into BACKUP state.
891 */
892 if (timevalcmp(&sc_tv, &ch_tv, >) ||
893 timevalcmp(&sc_tv, &ch_tv, ==)) {
894 callout_stop(&sc->sc_ad_tmo);
895 carp_set_state(sc, BACKUP,
896 "more frequent advertisement received");
897 carp_setrun(sc, 0);
898 carp_delroute(sc);
899 }
900 break;
901 case BACKUP:
902 /*
903 * If we're pre-empting masters who advertise slower than us,
904 * and this one claims to be slower, treat him as down.
905 */
906 if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
907 carp_master_down_locked(sc,
908 "preempting a slower master");
909 break;
910 }
911
912 /*
913 * If the master is going to advertise at such a low frequency
914 * that he's guaranteed to time out, we'd might as well just
915 * treat him as timed out now.
916 */
917 sc_tv.tv_sec = sc->sc_advbase * 3;
918 if (timevalcmp(&sc_tv, &ch_tv, <)) {
919 carp_master_down_locked(sc, "master will time out");
920 break;
921 }
922
923 /*
924 * Otherwise, we reset the counter and wait for the next
925 * advertisement.
926 */
927 carp_setrun(sc, af);
928 break;
929 }
930
931 out:
932 CARP_UNLOCK(sc);
933 m_freem(m);
934 }
935
936 static void
vrrp_input_c(struct mbuf * m,int off,sa_family_t af,int ttl,int len,uint16_t phdrcksum)937 vrrp_input_c(struct mbuf *m, int off, sa_family_t af, int ttl,
938 int len, uint16_t phdrcksum)
939 {
940 struct vrrpv3_header *vh = mtodo(m, off);
941 struct ifnet *ifp = m->m_pkthdr.rcvif;
942 struct ifaddr *ifa;
943 struct carp_softc *sc;
944
945 NET_EPOCH_ASSERT();
946 MPASS(vh->vrrp_version == CARP_VERSION_VRRPv3);
947
948 ifa = carp_find_ifa(m, af, vh->vrrp_vrtid);
949 if (ifa == NULL) {
950 m_freem(m);
951 return;
952 }
953
954 sc = ifa->ifa_carp;
955 CARP_LOCK(sc);
956
957 ifa_free(ifa);
958
959 /* verify the CARP version. */
960 if (sc->sc_version != CARP_VERSION_VRRPv3) {
961 CARP_UNLOCK(sc);
962
963 CARPSTATS_INC(carps_badver);
964 CARP_DEBUG("%s: invalid version %d\n", if_name(ifp),
965 vh->vrrp_version);
966 m_freem(m);
967 return;
968 }
969
970 /* verify that the IP TTL is 255. */
971 if (ttl != CARP_DFLTTL) {
972 CARPSTATS_INC(carps_badttl);
973 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
974 ttl, if_name(m->m_pkthdr.rcvif));
975 goto out;
976 }
977
978 if (vrrp_checksum_verify(m, off, len, phdrcksum)) {
979 CARPSTATS_INC(carps_badsum);
980 CARP_DEBUG("%s: incorrect checksum for VRID %u@%s\n", __func__,
981 sc->sc_vhid, if_name(ifp));
982 goto out;
983 }
984
985 /* RFC9568, 7.1 Receiving VRRP packets. */
986 if (sc->sc_vrrp_prio == 255) {
987 CARP_DEBUG("%s: our priority is 255. Ignore peer announcement.\n",
988 __func__);
989 goto out;
990 }
991
992 /* XXX TODO Check IP address payload. */
993
994 sc->sc_vrrp_master_inter = ntohs(vh->vrrp_max_adver_int);
995
996 switch (sc->sc_state) {
997 case INIT:
998 break;
999 case MASTER:
1000 /*
1001 * If we receive an advertisement from a master who's going to
1002 * be more frequent than us, go into BACKUP state.
1003 * Same if the peer has a higher priority than us.
1004 */
1005 if (ntohs(vh->vrrp_max_adver_int) < sc->sc_vrrp_adv_inter ||
1006 vh->vrrp_priority > DEMOTE_VRRP_PRIO(sc)) {
1007 callout_stop(&sc->sc_ad_tmo);
1008 carp_set_state(sc, BACKUP,
1009 "more frequent advertisement received");
1010 carp_setrun(sc, 0);
1011 carp_delroute(sc);
1012 }
1013 break;
1014 case BACKUP:
1015 /*
1016 * If we're pre-empting masters who advertise slower than us,
1017 * and this one claims to be slower, treat him as down.
1018 */
1019 if (V_carp_preempt && (ntohs(vh->vrrp_max_adver_int) > sc->sc_vrrp_adv_inter
1020 || vh->vrrp_priority < DEMOTE_VRRP_PRIO(sc))) {
1021 carp_master_down_locked(sc,
1022 "preempting a slower master");
1023 break;
1024 }
1025
1026 /*
1027 * Otherwise, we reset the counter and wait for the next
1028 * advertisement.
1029 */
1030 carp_setrun(sc, af);
1031 break;
1032 }
1033
1034 out:
1035 CARP_UNLOCK(sc);
1036 m_freem(m);
1037 }
1038
1039 static int
carp_tag(struct carp_softc * sc,struct mbuf * m)1040 carp_tag(struct carp_softc *sc, struct mbuf *m)
1041 {
1042 struct m_tag *mtag;
1043
1044 /* Tag packet for carp_output */
1045 if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(sc->sc_vhid),
1046 M_NOWAIT)) == NULL) {
1047 m_freem(m);
1048 CARPSTATS_INC(carps_onomem);
1049 return (ENOMEM);
1050 }
1051 bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid));
1052 m_tag_prepend(m, mtag);
1053
1054 return (0);
1055 }
1056
1057 static void
carp_prepare_ad(struct mbuf * m,struct carp_softc * sc,struct carp_header * ch)1058 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
1059 {
1060
1061 MPASS(sc->sc_version == CARP_VERSION_CARP);
1062
1063 if (sc->sc_init_counter) {
1064 /* this could also be seconds since unix epoch */
1065 sc->sc_counter = arc4random();
1066 sc->sc_counter = sc->sc_counter << 32;
1067 sc->sc_counter += arc4random();
1068 } else
1069 sc->sc_counter++;
1070
1071 ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
1072 ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
1073
1074 carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
1075 }
1076
1077 static inline void
send_ad_locked(struct carp_softc * sc)1078 send_ad_locked(struct carp_softc *sc)
1079 {
1080 switch (sc->sc_version) {
1081 case CARP_VERSION_CARP:
1082 carp_send_ad_locked(sc);
1083 break;
1084 case CARP_VERSION_VRRPv3:
1085 vrrp_send_ad_locked(sc);
1086 break;
1087 }
1088 }
1089
1090 /*
1091 * To avoid LORs and possible recursions this function shouldn't
1092 * be called directly, but scheduled via taskqueue.
1093 */
1094 static void
carp_send_ad_all(void * ctx __unused,int pending __unused)1095 carp_send_ad_all(void *ctx __unused, int pending __unused)
1096 {
1097 struct carp_softc *sc;
1098 struct epoch_tracker et;
1099
1100 NET_EPOCH_ENTER(et);
1101 mtx_lock(&carp_mtx);
1102 LIST_FOREACH(sc, &carp_list, sc_next)
1103 if (sc->sc_state == MASTER) {
1104 CARP_LOCK(sc);
1105 CURVNET_SET(sc->sc_carpdev->if_vnet);
1106 send_ad_locked(sc);
1107 CURVNET_RESTORE();
1108 CARP_UNLOCK(sc);
1109 }
1110 mtx_unlock(&carp_mtx);
1111 NET_EPOCH_EXIT(et);
1112 }
1113
1114 /* Send a periodic advertisement, executed in callout context. */
1115 static void
carp_callout(void * v)1116 carp_callout(void *v)
1117 {
1118 struct carp_softc *sc = v;
1119 struct epoch_tracker et;
1120
1121 NET_EPOCH_ENTER(et);
1122 CARP_LOCK_ASSERT(sc);
1123 CURVNET_SET(sc->sc_carpdev->if_vnet);
1124 send_ad_locked(sc);
1125 CURVNET_RESTORE();
1126 CARP_UNLOCK(sc);
1127 NET_EPOCH_EXIT(et);
1128 }
1129
1130 static void
carp_send_ad_error(struct carp_softc * sc,int error)1131 carp_send_ad_error(struct carp_softc *sc, int error)
1132 {
1133
1134 /*
1135 * We track errors and successful sends with this logic:
1136 * - Any error resets success counter to 0.
1137 * - MAX_ERRORS triggers demotion.
1138 * - MIN_SUCCESS successes resets error counter to 0.
1139 * - MIN_SUCCESS reverts demotion, if it was triggered before.
1140 */
1141 if (error) {
1142 if (sc->sc_sendad_errors < INT_MAX)
1143 sc->sc_sendad_errors++;
1144 if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1145 static const char fmt[] = "send error %d on %s";
1146 char msg[sizeof(fmt) + IFNAMSIZ];
1147
1148 sprintf(msg, fmt, error, if_name(sc->sc_carpdev));
1149 carp_demote_adj(V_carp_senderr_adj, msg);
1150 }
1151 sc->sc_sendad_success = 0;
1152 } else if (sc->sc_sendad_errors > 0) {
1153 if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
1154 if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1155 static const char fmt[] = "send ok on %s";
1156 char msg[sizeof(fmt) + IFNAMSIZ];
1157
1158 sprintf(msg, fmt, if_name(sc->sc_carpdev));
1159 carp_demote_adj(-V_carp_senderr_adj, msg);
1160 }
1161 sc->sc_sendad_errors = 0;
1162 }
1163 }
1164 }
1165
1166 /*
1167 * Pick the best ifaddr on the given ifp for sending CARP
1168 * advertisements.
1169 *
1170 * "Best" here is defined by ifa_preferred(). This function is much
1171 * much like ifaof_ifpforaddr() except that we just use ifa_preferred().
1172 *
1173 * (This could be simplified to return the actual address, except that
1174 * it has a different format in AF_INET and AF_INET6.)
1175 */
1176 static struct ifaddr *
carp_best_ifa(int af,struct ifnet * ifp)1177 carp_best_ifa(int af, struct ifnet *ifp)
1178 {
1179 struct ifaddr *ifa, *best;
1180
1181 NET_EPOCH_ASSERT();
1182
1183 if (af >= AF_MAX)
1184 return (NULL);
1185 best = NULL;
1186 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1187 if (ifa->ifa_addr->sa_family == af &&
1188 (best == NULL || ifa_preferred(best, ifa)))
1189 best = ifa;
1190 }
1191 if (best != NULL)
1192 ifa_ref(best);
1193 return (best);
1194 }
1195
/*
 * Build and transmit a CARP advertisement for each address family that
 * has addresses configured on the softc, then re-arm the advertisement
 * callout.
 *
 * Must be called inside the network epoch with the softc lock held and
 * only for softcs running the CARP protocol version (all asserted below).
 * Every exit path, including the failure paths, goes through the
 * "resched" label so the callout is always re-armed.
 */
static void
carp_send_ad_locked(struct carp_softc *sc)
{
	struct carp_header ch;
	struct timeval tv;
	struct ifaddr *ifa;
	struct carp_header *ch_ptr;
	struct mbuf *m;
	int len, advskew;

	NET_EPOCH_ASSERT();
	CARP_LOCK_ASSERT(sc);
	MPASS(sc->sc_version == CARP_VERSION_CARP);

	/* Delay until the next advertisement: advbase s + advskew/256 s. */
	advskew = DEMOTE_ADVSKEW(sc);
	tv.tv_sec = sc->sc_advbase;
	tv.tv_usec = advskew * 1000000 / 256;

	/* Header template, copied into both the IPv4 and IPv6 packet. */
	ch.carp_version = CARP_VERSION_CARP;
	ch.carp_type = CARP_ADVERTISEMENT;
	ch.carp_vhid = sc->sc_vhid;
	ch.carp_advbase = sc->sc_advbase;
	ch.carp_advskew = advskew;
	ch.carp_authlen = 7;	/* XXX DEFINE */
	ch.carp_pad1 = 0;	/* must be zero */
	ch.carp_cksum = 0;

	/* XXXGL: OpenBSD picks first ifaddr with needed family. */

#ifdef INET
	if (sc->sc_naddrs) {
		struct ip *ip;

		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL) {
			CARPSTATS_INC(carps_onomem);
			goto resched;
		}
		len = sizeof(*ip) + sizeof(ch);
		m->m_pkthdr.len = len;
		m->m_pkthdr.rcvif = NULL;
		m->m_len = len;
		M_ALIGN(m, m->m_len);
		if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)))
			m->m_flags |= M_MCAST;
		ip = mtod(m, struct ip *);
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET;
		ip->ip_len = htons(len);
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = CARP_DFLTTL;
		ip->ip_p = IPPROTO_CARP;
		ip->ip_sum = 0;
		ip_fillid(ip, V_ip_random_id);

		/* Source is the preferred IPv4 address, or 0.0.0.0 if none. */
		ifa = carp_best_ifa(AF_INET, sc->sc_carpdev);
		if (ifa != NULL) {
			ip->ip_src.s_addr =
			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
			ifa_free(ifa);
		} else
			ip->ip_src.s_addr = 0;
		ip->ip_dst = sc->sc_carpaddr;

		ch_ptr = (struct carp_header *)(&ip[1]);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, sc, ch_ptr);
		/* carp_tag() frees the mbuf itself when it fails. */
		if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)) &&
		    carp_tag(sc, m) != 0)
			goto resched;

		/*
		 * Temporarily advance m_data past the IP header so that the
		 * checksum covers the CARP header only.
		 */
		m->m_data += sizeof(*ip);
		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
		m->m_data -= sizeof(*ip);

		CARPSTATS_INC(carps_opackets);

		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
	}
#endif /* INET */
#ifdef INET6
	if (sc->sc_naddrs6) {
		struct ip6_hdr *ip6;

		m = m_gethdr(M_NOWAIT, MT_DATA);
		if (m == NULL) {
			CARPSTATS_INC(carps_onomem);
			goto resched;
		}
		len = sizeof(*ip6) + sizeof(ch);
		m->m_pkthdr.len = len;
		m->m_pkthdr.rcvif = NULL;
		m->m_len = len;
		M_ALIGN(m, m->m_len);
		ip6 = mtod(m, struct ip6_hdr *);
		bzero(ip6, sizeof(*ip6));
		ip6->ip6_vfc |= IPV6_VERSION;
		/* Traffic class isn't defined in ip6 struct instead
		 * it gets offset into flowid field */
		ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN +
		    IPTOS_DSCP_OFFSET));
		ip6->ip6_hlim = CARP_DFLTTL;
		ip6->ip6_nxt = IPPROTO_CARP;

		/* set the source address */
		ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev);
		if (ifa != NULL) {
			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
			    sizeof(struct in6_addr));
			ifa_free(ifa);
		} else
			/* This should never happen with IPv6. */
			bzero(&ip6->ip6_src, sizeof(struct in6_addr));

		/* Set the multicast destination. */
		memcpy(&ip6->ip6_dst, &sc->sc_carpaddr6, sizeof(ip6->ip6_dst));
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) {
			if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
				m_freem(m);
				CARP_DEBUG("%s: in6_setscope failed\n", __func__);
				goto resched;
			}
		}

		ch_ptr = (struct carp_header *)(&ip6[1]);
		bcopy(&ch, ch_ptr, sizeof(ch));
		carp_prepare_ad(m, sc, ch_ptr);
		/* carp_tag() frees the mbuf itself when it fails. */
		if (IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6) &&
		    carp_tag(sc, m) != 0)
			goto resched;

		/* Checksum the CARP header only, as in the IPv4 case. */
		m->m_data += sizeof(*ip6);
		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
		m->m_data -= sizeof(*ip6);

		CARPSTATS_INC(carps_opackets6);

		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
	}
#endif /* INET6 */

resched:
	/* Schedule the next advertisement regardless of what happened above. */
	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_callout, sc);
}
1344
1345 static void
vrrp_send_ad_locked(struct carp_softc * sc)1346 vrrp_send_ad_locked(struct carp_softc *sc)
1347 {
1348 struct vrrpv3_header *vh_ptr;
1349 struct ifaddr *ifa;
1350 struct mbuf *m;
1351 int len;
1352 struct vrrpv3_header vh = {
1353 .vrrp_version = CARP_VERSION_VRRPv3,
1354 .vrrp_type = VRRP_TYPE_ADVERTISEMENT,
1355 .vrrp_vrtid = sc->sc_vhid,
1356 .vrrp_priority = DEMOTE_VRRP_PRIO(sc),
1357 .vrrp_count_addr = 0,
1358 .vrrp_max_adver_int = htons(sc->sc_vrrp_adv_inter),
1359 .vrrp_checksum = 0,
1360 };
1361
1362 NET_EPOCH_ASSERT();
1363 CARP_LOCK_ASSERT(sc);
1364 MPASS(sc->sc_version == CARP_VERSION_VRRPv3);
1365
1366 #ifdef INET
1367 if (sc->sc_naddrs) {
1368 struct ip *ip;
1369
1370 m = m_gethdr(M_NOWAIT, MT_DATA);
1371 if (m == NULL) {
1372 CARPSTATS_INC(carps_onomem);
1373 goto resched;
1374 }
1375 len = sizeof(*ip) + sizeof(vh);
1376 m->m_pkthdr.len = len;
1377 m->m_pkthdr.rcvif = NULL;
1378 m->m_len = len;
1379 M_ALIGN(m, m->m_len);
1380 m->m_flags |= M_MCAST;
1381 ip = mtod(m, struct ip *);
1382 ip->ip_v = IPVERSION;
1383 ip->ip_hl = sizeof(*ip) >> 2;
1384 ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET;
1385 ip->ip_off = htons(IP_DF);
1386 ip->ip_ttl = CARP_DFLTTL;
1387 ip->ip_p = IPPROTO_CARP;
1388 ip->ip_sum = 0;
1389 ip_fillid(ip, V_ip_random_id);
1390
1391 ifa = carp_best_ifa(AF_INET, sc->sc_carpdev);
1392 if (ifa != NULL) {
1393 ip->ip_src.s_addr =
1394 ifatoia(ifa)->ia_addr.sin_addr.s_addr;
1395 ifa_free(ifa);
1396 } else
1397 ip->ip_src.s_addr = 0;
1398 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
1399
1400 /* Include the IP addresses in the announcement. */
1401 for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) {
1402 struct sockaddr_in *in;
1403
1404 MPASS(sc->sc_ifas[i] != NULL);
1405 if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET)
1406 continue;
1407
1408 in = (struct sockaddr_in *)sc->sc_ifas[i]->ifa_addr;
1409
1410 if (m_append(m, sizeof(in->sin_addr),
1411 (caddr_t)&in->sin_addr) != 1) {
1412 m_freem(m);
1413 goto resched;
1414 }
1415
1416 vh.vrrp_count_addr++;
1417 len += sizeof(in->sin_addr);
1418 }
1419 ip->ip_len = htons(len);
1420
1421 vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip));
1422 bcopy(&vh, vh_ptr, sizeof(vh));
1423
1424 vh_ptr->vrrp_checksum = in_pseudo(ip->ip_src.s_addr,
1425 ip->ip_dst.s_addr,
1426 htonl((uint16_t)(len - sizeof(*ip)) + ip->ip_p));
1427 vh_ptr->vrrp_checksum = in_cksum_skip(m, len, sizeof(*ip));
1428
1429 if (carp_tag(sc, m))
1430 goto resched;
1431
1432 CARPSTATS_INC(carps_opackets);
1433
1434 carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
1435 &sc->sc_carpdev->if_carp->cif_imo, NULL));
1436 }
1437 #endif
1438 #ifdef INET6
1439 if (sc->sc_naddrs6) {
1440 struct ip6_hdr *ip6;
1441
1442 m = m_gethdr(M_NOWAIT, MT_DATA);
1443 if (m == NULL) {
1444 CARPSTATS_INC(carps_onomem);
1445 goto resched;
1446 }
1447 len = sizeof(*ip6) + sizeof(vh);
1448 m->m_pkthdr.len = len;
1449 m->m_pkthdr.rcvif = NULL;
1450 m->m_len = len;
1451 M_ALIGN(m, m->m_len);
1452 m->m_flags |= M_MCAST;
1453 ip6 = mtod(m, struct ip6_hdr *);
1454 bzero(ip6, sizeof(*ip6));
1455 ip6->ip6_vfc |= IPV6_VERSION;
1456 /* Traffic class isn't defined in ip6 struct instead
1457 * it gets offset into flowid field */
1458 ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN +
1459 IPTOS_DSCP_OFFSET));
1460 ip6->ip6_hlim = CARP_DFLTTL;
1461 ip6->ip6_nxt = IPPROTO_CARP;
1462
1463 /* set the source address */
1464 ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev);
1465 if (ifa != NULL) {
1466 bcopy(IFA_IN6(ifa), &ip6->ip6_src,
1467 sizeof(struct in6_addr));
1468 ifa_free(ifa);
1469 } else
1470 /* This should never happen with IPv6. */
1471 bzero(&ip6->ip6_src, sizeof(struct in6_addr));
1472
1473 /* Set the multicast destination. */
1474 bzero(&ip6->ip6_dst, sizeof(ip6->ip6_dst));
1475 ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
1476 ip6->ip6_dst.s6_addr8[15] = 0x12;
1477
1478 /* Include the IP addresses in the announcement. */
1479 len = sizeof(vh);
1480 for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) {
1481 struct sockaddr_in6 *in6;
1482
1483 MPASS(sc->sc_ifas[i] != NULL);
1484 if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET6)
1485 continue;
1486
1487 in6 = (struct sockaddr_in6 *)sc->sc_ifas[i]->ifa_addr;
1488
1489 if (m_append(m, sizeof(in6->sin6_addr),
1490 (char *)&in6->sin6_addr) != 1) {
1491 m_freem(m);
1492 goto resched;
1493 }
1494
1495 vh.vrrp_count_addr++;
1496 len += sizeof(in6->sin6_addr);
1497 }
1498 ip6->ip6_plen = htonl(len);
1499
1500 vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6));
1501 bcopy(&vh, vh_ptr, sizeof(vh));
1502
1503 vh_ptr->vrrp_checksum = in6_cksum_pseudo(ip6, len, ip6->ip6_nxt, 0);
1504 vh_ptr->vrrp_checksum = in_cksum_skip(m, len + sizeof(*ip6), sizeof(*ip6));
1505
1506 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
1507 m_freem(m);
1508 CARP_DEBUG("%s: in6_setscope failed\n", __func__);
1509 goto resched;
1510 }
1511
1512 if (carp_tag(sc, m))
1513 goto resched;
1514 CARPSTATS_INC(carps_opackets6);
1515
1516 carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
1517 &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
1518 }
1519 #endif
1520
1521 resched:
1522 callout_reset(&sc->sc_ad_tmo, sc->sc_vrrp_adv_inter * hz / 100,
1523 carp_callout, sc);
1524 }
1525
1526 static void
carp_addroute(struct carp_softc * sc)1527 carp_addroute(struct carp_softc *sc)
1528 {
1529 struct ifaddr *ifa;
1530
1531 CARP_FOREACH_IFA(sc, ifa)
1532 carp_ifa_addroute(ifa);
1533 }
1534
1535 static void
carp_ifa_addroute(struct ifaddr * ifa)1536 carp_ifa_addroute(struct ifaddr *ifa)
1537 {
1538
1539 switch (ifa->ifa_addr->sa_family) {
1540 #ifdef INET
1541 case AF_INET:
1542 in_addprefix(ifatoia(ifa));
1543 ifa_add_loopback_route(ifa,
1544 (struct sockaddr *)&ifatoia(ifa)->ia_addr);
1545 break;
1546 #endif
1547 #ifdef INET6
1548 case AF_INET6:
1549 ifa_add_loopback_route(ifa,
1550 (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
1551 nd6_add_ifa_lle(ifatoia6(ifa));
1552 break;
1553 #endif
1554 }
1555 }
1556
1557 static void
carp_delroute(struct carp_softc * sc)1558 carp_delroute(struct carp_softc *sc)
1559 {
1560 struct ifaddr *ifa;
1561
1562 CARP_FOREACH_IFA(sc, ifa)
1563 carp_ifa_delroute(ifa);
1564 }
1565
1566 static void
carp_ifa_delroute(struct ifaddr * ifa)1567 carp_ifa_delroute(struct ifaddr *ifa)
1568 {
1569
1570 switch (ifa->ifa_addr->sa_family) {
1571 #ifdef INET
1572 case AF_INET:
1573 ifa_del_loopback_route(ifa,
1574 (struct sockaddr *)&ifatoia(ifa)->ia_addr);
1575 in_scrubprefix(ifatoia(ifa), LLE_STATIC);
1576 break;
1577 #endif
1578 #ifdef INET6
1579 case AF_INET6:
1580 ifa_del_loopback_route(ifa,
1581 (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
1582 nd6_rem_ifa_lle(ifatoia6(ifa), 1);
1583 break;
1584 #endif
1585 }
1586 }
1587
1588 int
carp_master(struct ifaddr * ifa)1589 carp_master(struct ifaddr *ifa)
1590 {
1591 struct carp_softc *sc = ifa->ifa_carp;
1592
1593 return (sc->sc_state == MASTER);
1594 }
1595
1596 #ifdef INET
1597 /*
1598 * Broadcast a gratuitous ARP request containing
1599 * the virtual router MAC address for each IP address
1600 * associated with the virtual router.
1601 */
1602 static void
carp_send_arp(struct carp_softc * sc)1603 carp_send_arp(struct carp_softc *sc)
1604 {
1605 struct ifaddr *ifa;
1606 struct in_addr addr;
1607
1608 NET_EPOCH_ASSERT();
1609
1610 CARP_FOREACH_IFA(sc, ifa) {
1611 if (ifa->ifa_addr->sa_family != AF_INET)
1612 continue;
1613 addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
1614 arp_announce_ifaddr(sc->sc_carpdev, addr, sc->sc_addr);
1615 }
1616 }
1617
1618 int
carp_iamatch(struct ifaddr * ifa,uint8_t ** enaddr)1619 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
1620 {
1621 struct carp_softc *sc = ifa->ifa_carp;
1622
1623 if (sc->sc_state == MASTER) {
1624 *enaddr = sc->sc_addr;
1625 return (1);
1626 }
1627
1628 return (0);
1629 }
1630 #endif
1631
1632 #ifdef INET6
1633 static void
carp_send_na(struct carp_softc * sc)1634 carp_send_na(struct carp_softc *sc)
1635 {
1636 struct ifaddr *ifa;
1637 int flags;
1638
1639 /*
1640 * Sending Unsolicited Neighbor Advertisements
1641 *
1642 * If the node is a router, we MUST set the Router flag to one.
1643 * We set Override flag to one and send link-layer address option,
1644 * thus neighboring nodes will install the new link-layer address.
1645 */
1646 flags = ND_NA_FLAG_OVERRIDE;
1647 if (V_ip6_forwarding)
1648 flags |= ND_NA_FLAG_ROUTER;
1649 CARP_FOREACH_IFA(sc, ifa) {
1650 if (ifa->ifa_addr->sa_family != AF_INET6)
1651 continue;
1652 /*
1653 * We use unspecified address as destination here to avoid
1654 * scope initialization for each call.
1655 * nd6_na_output() will use all nodes multicast address if
1656 * destinaion address is unspecified.
1657 */
1658 nd6_na_output(sc->sc_carpdev, &in6addr_any, IFA_IN6(ifa),
1659 flags, ND6_NA_OPT_LLA | ND6_NA_CARP_MASTER, NULL);
1660 DELAY(1000); /* RetransTimer */
1661 }
1662 }
1663
1664 /*
1665 * Returns ifa in case it's a carp address and it is MASTER, or if the address
1666 * matches and is not a carp address. Returns NULL otherwise.
1667 */
1668 struct ifaddr *
carp_iamatch6(struct ifnet * ifp,struct in6_addr * taddr)1669 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
1670 {
1671 struct ifaddr *ifa;
1672
1673 NET_EPOCH_ASSERT();
1674
1675 ifa = NULL;
1676 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1677 if (ifa->ifa_addr->sa_family != AF_INET6)
1678 continue;
1679 if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
1680 continue;
1681 if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
1682 ifa = NULL;
1683 else
1684 ifa_ref(ifa);
1685 break;
1686 }
1687
1688 return (ifa);
1689 }
1690
1691 char *
carp_macmatch6(struct ifnet * ifp,struct mbuf * m,const struct in6_addr * taddr)1692 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
1693 {
1694 struct ifaddr *ifa;
1695 char *mac = NULL;
1696
1697 NET_EPOCH_ASSERT();
1698
1699 IFNET_FOREACH_IFA(ifp, ifa)
1700 if (ifa->ifa_addr->sa_family == AF_INET6 &&
1701 IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
1702 struct carp_softc *sc = ifa->ifa_carp;
1703 struct m_tag *mtag;
1704
1705 mtag = m_tag_get(PACKET_TAG_CARP,
1706 sizeof(sc->sc_vhid) + sizeof(sc->sc_addr),
1707 M_NOWAIT);
1708 if (mtag == NULL) {
1709 CARPSTATS_INC(carps_onomem);
1710 break;
1711 }
1712 /* carp_output expects sc_vhid first. */
1713 bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid));
1714 /*
1715 * Save sc_addr into mtag data after sc_vhid to avoid
1716 * possible access to destroyed softc.
1717 */
1718 mac = (char *)(mtag + 1) + sizeof(sc->sc_vhid);
1719 bcopy(sc->sc_addr, mac, sizeof(sc->sc_addr));
1720
1721 m_tag_prepend(m, mtag);
1722 break;
1723 }
1724
1725 return (mac);
1726 }
1727 #endif /* INET6 */
1728
1729 int
carp_forus(struct ifnet * ifp,u_char * dhost)1730 carp_forus(struct ifnet *ifp, u_char *dhost)
1731 {
1732 struct carp_softc *sc;
1733 uint8_t *ena = dhost;
1734
1735 if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
1736 return (0);
1737
1738 CIF_LOCK(ifp->if_carp);
1739 IFNET_FOREACH_CARP(ifp, sc) {
1740 /*
1741 * CARP_LOCK() is not here, since would protect nothing, but
1742 * cause deadlock with if_bridge, calling this under its lock.
1743 */
1744 if (sc->sc_state == MASTER && !bcmp(dhost, sc->sc_addr,
1745 ETHER_ADDR_LEN)) {
1746 CIF_UNLOCK(ifp->if_carp);
1747 return (1);
1748 }
1749 }
1750 CIF_UNLOCK(ifp->if_carp);
1751
1752 return (0);
1753 }
1754
1755 /* Master down timeout event, executed in callout context. */
1756 static void
carp_master_down(void * v)1757 carp_master_down(void *v)
1758 {
1759 struct carp_softc *sc = v;
1760 struct epoch_tracker et;
1761
1762 NET_EPOCH_ENTER(et);
1763 CARP_LOCK_ASSERT(sc);
1764
1765 CURVNET_SET(sc->sc_carpdev->if_vnet);
1766 if (sc->sc_state == BACKUP) {
1767 carp_master_down_locked(sc, "master timed out");
1768 }
1769 CURVNET_RESTORE();
1770
1771 CARP_UNLOCK(sc);
1772 NET_EPOCH_EXIT(et);
1773 }
1774
1775 static void
carp_master_down_locked(struct carp_softc * sc,const char * reason)1776 carp_master_down_locked(struct carp_softc *sc, const char *reason)
1777 {
1778
1779 NET_EPOCH_ASSERT();
1780 CARP_LOCK_ASSERT(sc);
1781
1782 switch (sc->sc_state) {
1783 case BACKUP:
1784 carp_set_state(sc, MASTER, reason);
1785 send_ad_locked(sc);
1786 #ifdef INET
1787 carp_send_arp(sc);
1788 #endif
1789 #ifdef INET6
1790 carp_send_na(sc);
1791 #endif
1792 carp_setrun(sc, 0);
1793 carp_addroute(sc);
1794 break;
1795 case INIT:
1796 case MASTER:
1797 #ifdef INVARIANTS
1798 panic("carp: VHID %u@%s: master_down event in %s state\n",
1799 sc->sc_vhid,
1800 if_name(sc->sc_carpdev),
1801 sc->sc_state ? "MASTER" : "INIT");
1802 #endif
1803 break;
1804 }
1805 }
1806
1807 /*
1808 * When in backup state, af indicates whether to reset the master down timer
1809 * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1810 */
1811 static void
carp_setrun(struct carp_softc * sc,sa_family_t af)1812 carp_setrun(struct carp_softc *sc, sa_family_t af)
1813 {
1814 struct timeval tv;
1815 int timeout;
1816
1817 CARP_LOCK_ASSERT(sc);
1818
1819 if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
1820 sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
1821 (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) ||
1822 !V_carp_allow)
1823 return;
1824
1825 switch (sc->sc_state) {
1826 case INIT:
1827 carp_set_state(sc, BACKUP, "initialization complete");
1828 carp_setrun(sc, 0);
1829 break;
1830 case BACKUP:
1831 callout_stop(&sc->sc_ad_tmo);
1832
1833 switch (sc->sc_version) {
1834 case CARP_VERSION_CARP:
1835 tv.tv_sec = 3 * sc->sc_advbase;
1836 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1837 timeout = tvtohz(&tv);
1838 break;
1839 case CARP_VERSION_VRRPv3:
1840 /* skew time */
1841 timeout = (256 - sc->sc_vrrp_prio) *
1842 sc->sc_vrrp_master_inter / 256;
1843 timeout += (3 * sc->sc_vrrp_master_inter);
1844 timeout *= hz;
1845 timeout /= 100; /* master interval is in centiseconds */
1846 break;
1847 }
1848 switch (af) {
1849 #ifdef INET
1850 case AF_INET:
1851 callout_reset(&sc->sc_md_tmo, timeout,
1852 carp_master_down, sc);
1853 break;
1854 #endif
1855 #ifdef INET6
1856 case AF_INET6:
1857 callout_reset(&sc->sc_md6_tmo, timeout,
1858 carp_master_down, sc);
1859 break;
1860 #endif
1861 default:
1862 #ifdef INET
1863 if (sc->sc_naddrs)
1864 callout_reset(&sc->sc_md_tmo, timeout,
1865 carp_master_down, sc);
1866 #endif
1867 #ifdef INET6
1868 if (sc->sc_naddrs6)
1869 callout_reset(&sc->sc_md6_tmo, timeout,
1870 carp_master_down, sc);
1871 #endif
1872 break;
1873 }
1874 break;
1875 case MASTER:
1876 switch (sc->sc_version) {
1877 case CARP_VERSION_CARP:
1878 tv.tv_sec = sc->sc_advbase;
1879 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1880 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1881 carp_callout, sc);
1882 break;
1883 case CARP_VERSION_VRRPv3:
1884 callout_reset(&sc->sc_ad_tmo,
1885 sc->sc_vrrp_adv_inter * hz / 100,
1886 carp_callout, sc);
1887 break;
1888 }
1889 break;
1890 }
1891 }
1892
1893 /*
1894 * Setup multicast structures.
1895 */
1896 static int
carp_multicast_setup(struct carp_if * cif,sa_family_t sa)1897 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
1898 {
1899 struct ifnet *ifp = cif->cif_ifp;
1900 int error = 0;
1901
1902 switch (sa) {
1903 #ifdef INET
1904 case AF_INET:
1905 {
1906 struct ip_moptions *imo = &cif->cif_imo;
1907 struct in_mfilter *imf;
1908 struct in_addr addr;
1909
1910 if (ip_mfilter_first(&imo->imo_head) != NULL)
1911 return (0);
1912
1913 imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
1914 ip_mfilter_init(&imo->imo_head);
1915 imo->imo_multicast_vif = -1;
1916
1917 addr.s_addr = htonl(INADDR_CARP_GROUP);
1918 if ((error = in_joingroup(ifp, &addr, NULL,
1919 &imf->imf_inm)) != 0) {
1920 ip_mfilter_free(imf);
1921 break;
1922 }
1923
1924 ip_mfilter_insert(&imo->imo_head, imf);
1925 imo->imo_multicast_ifp = ifp;
1926 imo->imo_multicast_ttl = CARP_DFLTTL;
1927 imo->imo_multicast_loop = 0;
1928 break;
1929 }
1930 #endif
1931 #ifdef INET6
1932 case AF_INET6:
1933 {
1934 struct ip6_moptions *im6o = &cif->cif_im6o;
1935 struct in6_mfilter *im6f[2];
1936 struct in6_addr in6;
1937
1938 if (ip6_mfilter_first(&im6o->im6o_head))
1939 return (0);
1940
1941 im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
1942 im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
1943
1944 ip6_mfilter_init(&im6o->im6o_head);
1945 im6o->im6o_multicast_hlim = CARP_DFLTTL;
1946 im6o->im6o_multicast_ifp = ifp;
1947
1948 /* Join IPv6 CARP multicast group. */
1949 bzero(&in6, sizeof(in6));
1950 in6.s6_addr16[0] = htons(0xff02);
1951 in6.s6_addr8[15] = 0x12;
1952 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1953 ip6_mfilter_free(im6f[0]);
1954 ip6_mfilter_free(im6f[1]);
1955 break;
1956 }
1957 if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) {
1958 ip6_mfilter_free(im6f[0]);
1959 ip6_mfilter_free(im6f[1]);
1960 break;
1961 }
1962
1963 /* Join solicited multicast address. */
1964 bzero(&in6, sizeof(in6));
1965 in6.s6_addr16[0] = htons(0xff02);
1966 in6.s6_addr32[1] = 0;
1967 in6.s6_addr32[2] = htonl(1);
1968 in6.s6_addr32[3] = 0;
1969 in6.s6_addr8[12] = 0xff;
1970
1971 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1972 ip6_mfilter_free(im6f[0]);
1973 ip6_mfilter_free(im6f[1]);
1974 break;
1975 }
1976
1977 if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) {
1978 in6_leavegroup(im6f[0]->im6f_in6m, NULL);
1979 ip6_mfilter_free(im6f[0]);
1980 ip6_mfilter_free(im6f[1]);
1981 break;
1982 }
1983 ip6_mfilter_insert(&im6o->im6o_head, im6f[0]);
1984 ip6_mfilter_insert(&im6o->im6o_head, im6f[1]);
1985 break;
1986 }
1987 #endif
1988 }
1989
1990 return (error);
1991 }
1992
1993 /*
1994 * Free multicast structures.
1995 */
1996 static void
carp_multicast_cleanup(struct carp_if * cif,sa_family_t sa)1997 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
1998 {
1999 #ifdef INET
2000 struct ip_moptions *imo = &cif->cif_imo;
2001 struct in_mfilter *imf;
2002 #endif
2003 #ifdef INET6
2004 struct ip6_moptions *im6o = &cif->cif_im6o;
2005 struct in6_mfilter *im6f;
2006 #endif
2007 sx_assert(&carp_sx, SA_XLOCKED);
2008
2009 switch (sa) {
2010 #ifdef INET
2011 case AF_INET:
2012 if (cif->cif_naddrs != 0)
2013 break;
2014
2015 while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) {
2016 ip_mfilter_remove(&imo->imo_head, imf);
2017 in_leavegroup(imf->imf_inm, NULL);
2018 ip_mfilter_free(imf);
2019 }
2020 break;
2021 #endif
2022 #ifdef INET6
2023 case AF_INET6:
2024 if (cif->cif_naddrs6 != 0)
2025 break;
2026
2027 while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) {
2028 ip6_mfilter_remove(&im6o->im6o_head, im6f);
2029 in6_leavegroup(im6f->im6f_in6m, NULL);
2030 ip6_mfilter_free(im6f);
2031 }
2032 break;
2033 #endif
2034 }
2035 }
2036
2037 int
carp_output(struct ifnet * ifp,struct mbuf * m,const struct sockaddr * sa)2038 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
2039 {
2040 struct m_tag *mtag;
2041 int vhid;
2042
2043 if (!sa)
2044 return (0);
2045
2046 switch (sa->sa_family) {
2047 #ifdef INET
2048 case AF_INET:
2049 break;
2050 #endif
2051 #ifdef INET6
2052 case AF_INET6:
2053 break;
2054 #endif
2055 default:
2056 return (0);
2057 }
2058
2059 mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
2060 if (mtag == NULL)
2061 return (0);
2062
2063 bcopy(mtag + 1, &vhid, sizeof(vhid));
2064
2065 /* Set the source MAC address to the Virtual Router MAC Address. */
2066 switch (ifp->if_type) {
2067 case IFT_ETHER:
2068 case IFT_BRIDGE:
2069 case IFT_L2VLAN: {
2070 struct ether_header *eh;
2071
2072 eh = mtod(m, struct ether_header *);
2073 eh->ether_shost[0] = 0;
2074 eh->ether_shost[1] = 0;
2075 eh->ether_shost[2] = 0x5e;
2076 eh->ether_shost[3] = 0;
2077 eh->ether_shost[4] = 1;
2078 eh->ether_shost[5] = vhid;
2079 }
2080 break;
2081 default:
2082 printf("%s: carp is not supported for the %d interface type\n",
2083 if_name(ifp), ifp->if_type);
2084 return (EOPNOTSUPP);
2085 }
2086
2087 return (0);
2088 }
2089
2090 static struct carp_softc*
carp_alloc(struct ifnet * ifp,carp_version_t version,int vhid)2091 carp_alloc(struct ifnet *ifp, carp_version_t version, int vhid)
2092 {
2093 struct carp_softc *sc;
2094 struct carp_if *cif;
2095
2096 sx_assert(&carp_sx, SA_XLOCKED);
2097
2098 if ((cif = ifp->if_carp) == NULL)
2099 cif = carp_alloc_if(ifp);
2100
2101 sc = malloc(sizeof(*sc), M_CARP, M_WAITOK);
2102 *sc = (struct carp_softc ){
2103 .sc_vhid = vhid,
2104 .sc_version = version,
2105 .sc_state = INIT,
2106 .sc_carpdev = ifp,
2107 .sc_ifasiz = sizeof(struct ifaddr *),
2108 .sc_addr = { 0, 0, 0x5e, 0, 1, vhid },
2109 };
2110 sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
2111
2112 switch (version) {
2113 case CARP_VERSION_CARP:
2114 sc->sc_advbase = CARP_DFLTINTV;
2115 sc->sc_init_counter = true;
2116 sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP);
2117 sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
2118 sc->sc_carpaddr6.s6_addr8[15] = 0x12;
2119 break;
2120 case CARP_VERSION_VRRPv3:
2121 sc->sc_vrrp_adv_inter = 100;
2122 sc->sc_vrrp_master_inter = sc->sc_vrrp_adv_inter;
2123 sc->sc_vrrp_prio = 100;
2124 break;
2125 }
2126
2127 CARP_LOCK_INIT(sc);
2128 #ifdef INET
2129 callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
2130 #endif
2131 #ifdef INET6
2132 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
2133 #endif
2134 callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
2135
2136 CIF_LOCK(cif);
2137 TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
2138 CIF_UNLOCK(cif);
2139
2140 mtx_lock(&carp_mtx);
2141 LIST_INSERT_HEAD(&carp_list, sc, sc_next);
2142 mtx_unlock(&carp_mtx);
2143
2144 return (sc);
2145 }
2146
2147 static void
carp_grow_ifas(struct carp_softc * sc)2148 carp_grow_ifas(struct carp_softc *sc)
2149 {
2150 struct ifaddr **new;
2151
2152 new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO);
2153 CARP_LOCK(sc);
2154 bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
2155 free(sc->sc_ifas, M_CARP);
2156 sc->sc_ifas = new;
2157 sc->sc_ifasiz *= 2;
2158 CARP_UNLOCK(sc);
2159 }
2160
/*
 * Tear down one vhid softc and release its memory.
 *
 * carp_sx must be held exclusively (asserted below).  The softc lock is
 * released, never acquired, in this function, so the caller is expected
 * to enter with CARP_LOCK(sc) held.
 */
static void
carp_destroy(struct carp_softc *sc)
{
	struct ifnet *ifp = sc->sc_carpdev;
	struct carp_if *cif = ifp->if_carp;

	sx_assert(&carp_sx, SA_XLOCKED);

	/* Give back the demotion that was applied while suppressed. */
	if (sc->sc_suppress)
		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
	CARP_UNLOCK(sc);

	/* Unlink from the per-interface list of vhids. */
	CIF_LOCK(cif);
	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
	CIF_UNLOCK(cif);

	/* Unlink from the global list of softcs. */
	mtx_lock(&carp_mtx);
	LIST_REMOVE(sc, sc_next);
	mtx_unlock(&carp_mtx);

	/* Drain the timers before the softc lock is destroyed. */
	callout_drain(&sc->sc_ad_tmo);
#ifdef INET
	callout_drain(&sc->sc_md_tmo);
#endif
#ifdef INET6
	callout_drain(&sc->sc_md6_tmo);
#endif
	CARP_LOCK_DESTROY(sc);

	free(sc->sc_ifas, M_CARP);
	free(sc, M_CARP);
}
2193
2194 static struct carp_if*
carp_alloc_if(struct ifnet * ifp)2195 carp_alloc_if(struct ifnet *ifp)
2196 {
2197 struct carp_if *cif;
2198 int error;
2199
2200 cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
2201
2202 if ((error = ifpromisc(ifp, 1)) != 0)
2203 printf("%s: ifpromisc(%s) failed: %d\n",
2204 __func__, if_name(ifp), error);
2205 else
2206 cif->cif_flags |= CIF_PROMISC;
2207
2208 CIF_LOCK_INIT(cif);
2209 cif->cif_ifp = ifp;
2210 TAILQ_INIT(&cif->cif_vrs);
2211
2212 IF_ADDR_WLOCK(ifp);
2213 ifp->if_carp = cif;
2214 if_ref(ifp);
2215 IF_ADDR_WUNLOCK(ifp);
2216
2217 return (cif);
2218 }
2219
2220 static void
carp_free_if(struct carp_if * cif)2221 carp_free_if(struct carp_if *cif)
2222 {
2223 struct ifnet *ifp = cif->cif_ifp;
2224
2225 CIF_LOCK_ASSERT(cif);
2226 KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
2227 __func__));
2228
2229 IF_ADDR_WLOCK(ifp);
2230 ifp->if_carp = NULL;
2231 IF_ADDR_WUNLOCK(ifp);
2232
2233 CIF_LOCK_DESTROY(cif);
2234
2235 if (cif->cif_flags & CIF_PROMISC)
2236 ifpromisc(ifp, 0);
2237 if_rele(ifp);
2238
2239 free(cif, M_CARP);
2240 }
2241
2242 static int
carp_get_vhid(struct ifaddr * ifa)2243 carp_get_vhid(struct ifaddr *ifa)
2244 {
2245
2246 if (ifa == NULL || ifa->ifa_carp == NULL)
2247 return (0);
2248
2249 return (ifa->ifa_carp->sc_vhid);
2250 }
2251
2252 int
carp_attach(struct ifaddr * ifa,int vhid)2253 carp_attach(struct ifaddr *ifa, int vhid)
2254 {
2255 struct ifnet *ifp = ifa->ifa_ifp;
2256 struct carp_if *cif = ifp->if_carp;
2257 struct carp_softc *sc;
2258 int index, error;
2259
2260 KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa));
2261
2262 switch (ifa->ifa_addr->sa_family) {
2263 #ifdef INET
2264 case AF_INET:
2265 #endif
2266 #ifdef INET6
2267 case AF_INET6:
2268 #endif
2269 break;
2270 default:
2271 return (EPROTOTYPE);
2272 }
2273
2274 sx_xlock(&carp_sx);
2275 if (ifp->if_carp == NULL) {
2276 sx_xunlock(&carp_sx);
2277 return (ENOPROTOOPT);
2278 }
2279
2280 IFNET_FOREACH_CARP(ifp, sc)
2281 if (sc->sc_vhid == vhid)
2282 break;
2283 if (sc == NULL) {
2284 sx_xunlock(&carp_sx);
2285 return (ENOENT);
2286 }
2287
2288 error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
2289 if (error) {
2290 CIF_FREE(cif);
2291 sx_xunlock(&carp_sx);
2292 return (error);
2293 }
2294
2295 index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
2296 if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
2297 carp_grow_ifas(sc);
2298
2299 switch (ifa->ifa_addr->sa_family) {
2300 #ifdef INET
2301 case AF_INET:
2302 cif->cif_naddrs++;
2303 sc->sc_naddrs++;
2304 break;
2305 #endif
2306 #ifdef INET6
2307 case AF_INET6:
2308 cif->cif_naddrs6++;
2309 sc->sc_naddrs6++;
2310 break;
2311 #endif
2312 }
2313
2314 ifa_ref(ifa);
2315
2316 CARP_LOCK(sc);
2317 sc->sc_ifas[index - 1] = ifa;
2318 ifa->ifa_carp = sc;
2319 if (sc->sc_version == CARP_VERSION_CARP)
2320 carp_hmac_prepare(sc);
2321 carp_sc_state(sc);
2322 CARP_UNLOCK(sc);
2323
2324 sx_xunlock(&carp_sx);
2325
2326 return (0);
2327 }
2328
/*
 * Detach interface address ifa from the vhid it is attached to.
 *
 * The address is removed from the softc's sc_ifas array, the per-family
 * counters on the softc and on the carp_if are decremented, and the
 * reference on ifa is dropped.  Unless keep_cif is true, a softc left
 * with no addresses is destroyed and CIF_FREE() is invoked on the
 * carp_if.  carp_sx is held exclusively for the whole operation.
 */
void
carp_detach(struct ifaddr *ifa, bool keep_cif)
{
	struct ifnet *ifp = ifa->ifa_ifp;
	struct carp_if *cif = ifp->if_carp;
	struct carp_softc *sc = ifa->ifa_carp;
	int i, index;

	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));

	sx_xlock(&carp_sx);

	CARP_LOCK(sc);
	/* Locate ifa in the array and close the gap by shifting the tail. */
	index = sc->sc_naddrs + sc->sc_naddrs6;
	for (i = 0; i < index; i++)
		if (sc->sc_ifas[i] == ifa)
			break;
	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
	for (; i < index - 1; i++)
		sc->sc_ifas[i] = sc->sc_ifas[i+1];
	sc->sc_ifas[index - 1] = NULL;

	switch (ifa->ifa_addr->sa_family) {
#ifdef INET
	case AF_INET:
		cif->cif_naddrs--;
		sc->sc_naddrs--;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		cif->cif_naddrs6--;
		sc->sc_naddrs6--;
		break;
#endif
	}

	carp_ifa_delroute(ifa);
	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);

	ifa->ifa_carp = NULL;
	ifa_free(ifa);

	/* Only the CARP protocol version prepares an HMAC here. */
	if (sc->sc_version == CARP_VERSION_CARP)
		carp_hmac_prepare(sc);
	carp_sc_state(sc);

	/* carp_destroy() releases the softc lock itself. */
	if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)
		carp_destroy(sc);
	else
		CARP_UNLOCK(sc);

	/* NOTE(review): CIF_FREE is defined outside this view - confirm it only frees an empty carp_if. */
	if (!keep_cif)
		CIF_FREE(cif);

	sx_xunlock(&carp_sx);
}
2387
2388 static void
carp_set_state(struct carp_softc * sc,int state,const char * reason)2389 carp_set_state(struct carp_softc *sc, int state, const char *reason)
2390 {
2391
2392 CARP_LOCK_ASSERT(sc);
2393
2394 if (sc->sc_state != state) {
2395 const char *carp_states[] = { CARP_STATES };
2396 char subsys[IFNAMSIZ+5];
2397
2398 snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
2399 if_name(sc->sc_carpdev));
2400
2401 CARP_LOG("%s: %s -> %s (%s)\n", subsys,
2402 carp_states[sc->sc_state], carp_states[state], reason);
2403
2404 sc->sc_state = state;
2405
2406 devctl_notify("CARP", subsys, carp_states[state], NULL);
2407 }
2408 }
2409
2410 static void
carp_linkstate(struct ifnet * ifp)2411 carp_linkstate(struct ifnet *ifp)
2412 {
2413 struct carp_softc *sc;
2414
2415 CIF_LOCK(ifp->if_carp);
2416 IFNET_FOREACH_CARP(ifp, sc) {
2417 CARP_LOCK(sc);
2418 carp_sc_state(sc);
2419 CARP_UNLOCK(sc);
2420 }
2421 CIF_UNLOCK(ifp->if_carp);
2422 }
2423
2424 static void
carp_sc_state(struct carp_softc * sc)2425 carp_sc_state(struct carp_softc *sc)
2426 {
2427
2428 CARP_LOCK_ASSERT(sc);
2429
2430 if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
2431 !(sc->sc_carpdev->if_flags & IFF_UP) ||
2432 !V_carp_allow) {
2433 callout_stop(&sc->sc_ad_tmo);
2434 #ifdef INET
2435 callout_stop(&sc->sc_md_tmo);
2436 #endif
2437 #ifdef INET6
2438 callout_stop(&sc->sc_md6_tmo);
2439 #endif
2440 carp_set_state(sc, INIT, "hardware interface down");
2441 carp_setrun(sc, 0);
2442 carp_delroute(sc);
2443 if (!sc->sc_suppress)
2444 carp_demote_adj(V_carp_ifdown_adj, "interface down");
2445 sc->sc_suppress = 1;
2446 } else {
2447 carp_set_state(sc, INIT, "hardware interface up");
2448 carp_setrun(sc, 0);
2449 if (sc->sc_suppress)
2450 carp_demote_adj(-V_carp_ifdown_adj, "interface up");
2451 sc->sc_suppress = 0;
2452 }
2453 }
2454
/*
 * Add 'adj' (which may be negative) to the demotion counter, log the
 * adjustment together with 'reason', and queue carp_sendall_task on
 * taskqueue_swi.
 */
static void
carp_demote_adj(int adj, char *reason)
{
	atomic_add_int(&V_carp_demotion, adj);
	/* The counter is re-read here, after the atomic add, for the log. */
	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
}
2462
2463 static int
carp_allow_sysctl(SYSCTL_HANDLER_ARGS)2464 carp_allow_sysctl(SYSCTL_HANDLER_ARGS)
2465 {
2466 int new, error;
2467 struct carp_softc *sc;
2468
2469 new = V_carp_allow;
2470 error = sysctl_handle_int(oidp, &new, 0, req);
2471 if (error || !req->newptr)
2472 return (error);
2473
2474 if (V_carp_allow != new) {
2475 V_carp_allow = new;
2476
2477 mtx_lock(&carp_mtx);
2478 LIST_FOREACH(sc, &carp_list, sc_next) {
2479 CARP_LOCK(sc);
2480 if (curvnet == sc->sc_carpdev->if_vnet)
2481 carp_sc_state(sc);
2482 CARP_UNLOCK(sc);
2483 }
2484 mtx_unlock(&carp_mtx);
2485 }
2486
2487 return (0);
2488 }
2489
2490 static int
carp_dscp_sysctl(SYSCTL_HANDLER_ARGS)2491 carp_dscp_sysctl(SYSCTL_HANDLER_ARGS)
2492 {
2493 int new, error;
2494
2495 new = V_carp_dscp;
2496 error = sysctl_handle_int(oidp, &new, 0, req);
2497 if (error || !req->newptr)
2498 return (error);
2499
2500 if (new < 0 || new > 63)
2501 return (EINVAL);
2502
2503 V_carp_dscp = new;
2504
2505 return (0);
2506 }
2507
2508 static int
carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)2509 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
2510 {
2511 int new, error;
2512
2513 new = V_carp_demotion;
2514 error = sysctl_handle_int(oidp, &new, 0, req);
2515 if (error || !req->newptr)
2516 return (error);
2517
2518 carp_demote_adj(new, "sysctl");
2519
2520 return (0);
2521 }
2522
2523 static int
nlattr_get_carp_key(struct nlattr * nla,struct nl_pstate * npt,const void * arg,void * target)2524 nlattr_get_carp_key(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
2525 {
2526 if (__predict_false(NLA_DATA_LEN(nla) > CARP_KEY_LEN))
2527 return (EINVAL);
2528
2529 memcpy(target, NLA_DATA_CONST(nla), NLA_DATA_LEN(nla));
2530 return (0);
2531 }
2532
/*
 * Destination for the attributes of a CARP netlink request.  The request
 * handlers zero-initialise it, so an attribute that was not supplied
 * reads as 0 / NULL.
 */
struct nl_carp_parsed {
	unsigned int	ifindex;	/* CARP_NL_IFINDEX; used when ifname is NULL */
	char		*ifname;	/* CARP_NL_IFNAME; preferred over ifindex */
	uint32_t	state;		/* CARP_NL_STATE */
	uint32_t	vhid;		/* CARP_NL_VHID */
	int32_t		advbase;	/* CARP_NL_ADVBASE */
	int32_t		advskew;	/* CARP_NL_ADVSKEW */
	char		key[CARP_KEY_LEN];	/* CARP_NL_KEY */
	struct in_addr	addr;		/* CARP_NL_ADDR */
	struct in6_addr	addr6;		/* CARP_NL_ADDR6 */
	carp_version_t	version;	/* CARP_NL_VERSION */
	uint8_t		vrrp_prio;	/* CARP_NL_VRRP_PRIORITY */
	uint16_t	vrrp_adv_inter;	/* CARP_NL_VRRP_ADV_INTER */
};
2547
/*
 * Attribute parser table: maps each CARP_NL_* attribute to a field of
 * struct nl_carp_parsed and to the callback that decodes it.
 *
 * NOTE(review): the order of the entries is kept as is; the netlink
 * parser verification may depend on it - confirm before reordering.
 * NOTE(review): advbase/advskew are int32_t but decoded with
 * nlattr_get_uint32, and 'version' is a carp_version_t decoded with
 * nlattr_get_uint8 - confirm this is intended.
 */
#define	_OUT(_field)	offsetof(struct nl_carp_parsed, _field)
static const struct nlattr_parser nla_p_set[] = {
	{ .type = CARP_NL_VHID, .off = _OUT(vhid), .cb = nlattr_get_uint32 },
	{ .type = CARP_NL_STATE, .off = _OUT(state), .cb = nlattr_get_uint32 },
	{ .type = CARP_NL_ADVBASE, .off = _OUT(advbase), .cb = nlattr_get_uint32 },
	{ .type = CARP_NL_ADVSKEW, .off = _OUT(advskew), .cb = nlattr_get_uint32 },
	{ .type = CARP_NL_KEY, .off = _OUT(key), .cb = nlattr_get_carp_key },
	{ .type = CARP_NL_IFINDEX, .off = _OUT(ifindex), .cb = nlattr_get_uint32 },
	{ .type = CARP_NL_ADDR, .off = _OUT(addr), .cb = nlattr_get_in_addr },
	{ .type = CARP_NL_ADDR6, .off = _OUT(addr6), .cb = nlattr_get_in6_addr },
	{ .type = CARP_NL_IFNAME, .off = _OUT(ifname), .cb = nlattr_get_string },
	{ .type = CARP_NL_VERSION, .off = _OUT(version), .cb = nlattr_get_uint8 },
	{ .type = CARP_NL_VRRP_PRIORITY, .off = _OUT(vrrp_prio), .cb = nlattr_get_uint8 },
	{ .type = CARP_NL_VRRP_ADV_INTER, .off = _OUT(vrrp_adv_inter), .cb = nlattr_get_uint16 },
};
/* Parser for CARP requests: generic netlink header plus the table above. */
NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_empty, nla_p_set);
#undef _OUT
2565
2566
2567 static int
carp_nl_get(struct nlmsghdr * hdr,struct nl_pstate * npt)2568 carp_nl_get(struct nlmsghdr *hdr, struct nl_pstate *npt)
2569 {
2570 struct nl_carp_parsed attrs = { };
2571 struct epoch_tracker et;
2572 struct nl_writer *nw = npt->nw;
2573 struct carp_softc *sc;
2574 if_t ifp = NULL;
2575 int error;
2576 bool privileged;
2577
2578 error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs);
2579 if (error != 0)
2580 return (error);
2581
2582 if (attrs.vhid < 0 || attrs.vhid > CARP_MAXVHID)
2583 return (EINVAL);
2584
2585 NET_EPOCH_ENTER(et);
2586 if (attrs.ifname != NULL)
2587 ifp = ifunit_ref(attrs.ifname);
2588 else if (attrs.ifindex != 0)
2589 ifp = ifnet_byindex_ref(attrs.ifindex);
2590 NET_EPOCH_EXIT(et);
2591
2592 if ((error = carp_is_supported_if(ifp)) != 0)
2593 goto out;
2594
2595 if (ifp->if_carp == NULL) {
2596 error = ENOENT;
2597 goto out;
2598 }
2599
2600 hdr->nlmsg_flags |= NLM_F_MULTI;
2601 privileged = (priv_check_cred(nlp_get_cred(npt->nlp),
2602 PRIV_NETINET_CARP) == 0);
2603
2604 sx_xlock(&carp_sx);
2605 IFNET_FOREACH_CARP(ifp, sc) {
2606 struct genlmsghdr *ghdr_new;
2607
2608 if (attrs.vhid != 0 && attrs.vhid != sc->sc_vhid)
2609 continue;
2610
2611 if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) {
2612 nlmsg_abort(nw);
2613 error = ENOMEM;
2614 break;
2615 }
2616
2617 ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
2618 if (ghdr_new == NULL) {
2619 nlmsg_abort(nw);
2620 error = ENOMEM;
2621 break;
2622 }
2623
2624 ghdr_new->cmd = CARP_NL_CMD_GET;
2625 ghdr_new->version = 0;
2626 ghdr_new->reserved = 0;
2627
2628 CARP_LOCK(sc);
2629 nlattr_add_u32(nw, CARP_NL_VHID, sc->sc_vhid);
2630 nlattr_add_u32(nw, CARP_NL_STATE, sc->sc_state);
2631 nlattr_add_u8(nw, CARP_NL_VERSION, sc->sc_version);
2632 switch (sc->sc_version) {
2633 case CARP_VERSION_CARP:
2634 nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase);
2635 nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew);
2636 nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr);
2637 nlattr_add_in6_addr(nw, CARP_NL_ADDR6,
2638 &sc->sc_carpaddr6);
2639 if (privileged)
2640 nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key),
2641 sc->sc_key);
2642 break;
2643 case CARP_VERSION_VRRPv3:
2644 nlattr_add_u8(nw, CARP_NL_VRRP_PRIORITY,
2645 sc->sc_vrrp_prio);
2646 nlattr_add_u16(nw, CARP_NL_VRRP_ADV_INTER,
2647 sc->sc_vrrp_adv_inter);
2648 break;
2649 }
2650 CARP_UNLOCK(sc);
2651
2652 if (! nlmsg_end(nw)) {
2653 nlmsg_abort(nw);
2654 error = ENOMEM;
2655 break;
2656 }
2657 }
2658 sx_xunlock(&carp_sx);
2659
2660 if (! nlmsg_end_dump(nw, error, hdr))
2661 error = ENOMEM;
2662
2663 out:
2664 if (ifp != NULL)
2665 if_rele(ifp);
2666
2667 return (error);
2668 }
2669
2670 static int
carp_nl_set(struct nlmsghdr * hdr,struct nl_pstate * npt)2671 carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt)
2672 {
2673 struct nl_carp_parsed attrs = { };
2674 struct epoch_tracker et;
2675 struct carp_softc *sc;
2676 if_t ifp = NULL;
2677 int error;
2678
2679 error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs);
2680 if (error != 0)
2681 return (error);
2682
2683 if (attrs.vhid <= 0 || attrs.vhid > CARP_MAXVHID)
2684 return (EINVAL);
2685 if (attrs.state > CARP_MAXSTATE)
2686 return (EINVAL);
2687 if (attrs.version == 0) /* compat with pre-VRRPv3 */
2688 attrs.version = CARP_VERSION_CARP;
2689 switch (attrs.version) {
2690 case CARP_VERSION_CARP:
2691 if (attrs.advbase < 0 || attrs.advskew < 0)
2692 return (EINVAL);
2693 if (attrs.advbase > 255)
2694 return (EINVAL);
2695 if (attrs.advskew >= 255)
2696 return (EINVAL);
2697 break;
2698 case CARP_VERSION_VRRPv3:
2699 if (attrs.vrrp_adv_inter > VRRP_MAX_INTERVAL)
2700 return (EINVAL);
2701 break;
2702 default:
2703 return (EINVAL);
2704 }
2705
2706 NET_EPOCH_ENTER(et);
2707 if (attrs.ifname != NULL)
2708 ifp = ifunit_ref(attrs.ifname);
2709 else if (attrs.ifindex != 0)
2710 ifp = ifnet_byindex_ref(attrs.ifindex);
2711 NET_EPOCH_EXIT(et);
2712
2713 if ((error = carp_is_supported_if(ifp)) != 0)
2714 goto out;
2715
2716 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
2717 error = EADDRNOTAVAIL;
2718 goto out;
2719 }
2720
2721 sx_xlock(&carp_sx);
2722 if (ifp->if_carp) {
2723 IFNET_FOREACH_CARP(ifp, sc)
2724 if (sc->sc_vhid == attrs.vhid)
2725 break;
2726 } else
2727 sc = NULL;
2728 if (sc == NULL)
2729 sc = carp_alloc(ifp, attrs.version, attrs.vhid);
2730 else if (sc->sc_version != attrs.version) {
2731 sx_xunlock(&carp_sx);
2732 error = EINVAL;
2733 goto out;
2734 }
2735
2736 CARP_LOCK(sc);
2737 switch (sc->sc_version) {
2738 case CARP_VERSION_CARP:
2739 if (attrs.advbase != 0)
2740 sc->sc_advbase = attrs.advbase;
2741 sc->sc_advskew = attrs.advskew;
2742 if (attrs.addr.s_addr != INADDR_ANY)
2743 sc->sc_carpaddr = attrs.addr;
2744 if (!IN6_IS_ADDR_UNSPECIFIED(&attrs.addr6)) {
2745 memcpy(&sc->sc_carpaddr6, &attrs.addr6,
2746 sizeof(sc->sc_carpaddr6));
2747 }
2748 if (attrs.key[0] != '\0') {
2749 bcopy(attrs.key, sc->sc_key, sizeof(sc->sc_key));
2750 carp_hmac_prepare(sc);
2751 }
2752 break;
2753 case CARP_VERSION_VRRPv3:
2754 if (attrs.vrrp_prio != 0)
2755 sc->sc_vrrp_prio = attrs.vrrp_prio;
2756 if (attrs.vrrp_adv_inter)
2757 sc->sc_vrrp_adv_inter = attrs.vrrp_adv_inter;
2758 break;
2759 }
2760
2761 if (sc->sc_state != INIT && sc->sc_state != attrs.state) {
2762 switch (attrs.state) {
2763 case BACKUP:
2764 callout_stop(&sc->sc_ad_tmo);
2765 carp_set_state(sc, BACKUP,
2766 "user requested via ifconfig");
2767 carp_setrun(sc, 0);
2768 carp_delroute(sc);
2769 break;
2770 case MASTER:
2771 NET_EPOCH_ENTER(et);
2772 carp_master_down_locked(sc,
2773 "user requested via ifconfig");
2774 NET_EPOCH_EXIT(et);
2775 break;
2776 default:
2777 break;
2778 }
2779 }
2780 CARP_UNLOCK(sc);
2781 sx_xunlock(&carp_sx);
2782
2783 out:
2784 if (ifp != NULL)
2785 if_rele(ifp);
2786
2787 return (error);
2788 }
2789
/* Parsers handed to NL_VERIFY_PARSERS() when the family is registered. */
static const struct nlhdr_parser *all_parsers[] = {
	&carp_parser
};
2793
/*
 * Generic netlink commands of the CARP family.
 *
 * GET sets no cmd_priv; carp_nl_get() checks PRIV_NETINET_CARP itself to
 * decide whether the key is included in the reply.  SET sets cmd_priv to
 * PRIV_NETINET_CARP.
 */
static const struct genl_cmd carp_cmds[] = {
	{
		.cmd_num = CARP_NL_CMD_GET,
		.cmd_name = "SIOCGVH",
		.cmd_cb = carp_nl_get,
		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP |
		    GENL_CMD_CAP_HASPOL,
	},
	{
		.cmd_num = CARP_NL_CMD_SET,
		.cmd_name = "SIOCSVH",
		.cmd_cb = carp_nl_set,
		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL,
		.cmd_priv = PRIV_NETINET_CARP,
	},
};
2810
2811 static uint16_t carp_family_id;
2812 static void
carp_nl_register(void)2813 carp_nl_register(void)
2814 {
2815 bool ret __diagused;
2816
2817 NL_VERIFY_PARSERS(all_parsers);
2818 carp_family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2,
2819 CARP_NL_CMD_MAX);
2820 MPASS(carp_family_id != 0);
2821
2822 ret = genl_register_cmds(carp_family_id, carp_cmds, nitems(carp_cmds));
2823 MPASS(ret);
2824 }
2825
/* Unregister the CARP generic netlink family by its saved family id. */
static void
carp_nl_unregister(void)
{
	genl_unregister_family(carp_family_id);
}
2831
/*
 * Undo module initialisation: unregister the netlink family and the
 * protocol handlers, clear the hook pointers, drain the pending send-all
 * task and destroy the global locks.
 *
 * Must be entered with carp_mtx held: the mutex is released here, before
 * the task queue is drained, and destroyed afterwards.
 */
static void
carp_mod_cleanup(void)
{

	carp_nl_unregister();

#ifdef INET
	(void)ipproto_unregister(IPPROTO_CARP);
	carp_iamatch_p = NULL;
#endif
#ifdef INET6
	(void)ip6proto_unregister(IPPROTO_CARP);
	carp_iamatch6_p = NULL;
	carp_macmatch6_p = NULL;
#endif
	carp_attach_p = NULL;
	carp_detach_p = NULL;
	carp_get_vhid_p = NULL;
	carp_linkstate_p = NULL;
	carp_forus_p = NULL;
	carp_output_p = NULL;
	carp_demote_adj_p = NULL;
	carp_master_p = NULL;
	/* Drop the caller's hold on carp_mtx before draining the task. */
	mtx_unlock(&carp_mtx);
	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
	mtx_destroy(&carp_mtx);
	sx_destroy(&carp_sx);
}
2860
/*
 * Per-vnet initialisation, registered through VNET_SYSINIT below.
 */
static void
ipcarp_sysinit(void)
{

	/*
	 * Fetch "net.inet.carp.allow" as a tunable so that starting CARP
	 * can be postponed until after the module has been loaded.
	 */
	TUNABLE_INT_FETCH("net.inet.carp.allow", &V_carp_allow);
}
VNET_SYSINIT(ip_carp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipcarp_sysinit, NULL);
2869
2870 static int
carp_mod_load(void)2871 carp_mod_load(void)
2872 {
2873 int err;
2874
2875 mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
2876 sx_init(&carp_sx, "carp_sx");
2877 carp_get_vhid_p = carp_get_vhid;
2878 carp_forus_p = carp_forus;
2879 carp_output_p = carp_output;
2880 carp_linkstate_p = carp_linkstate;
2881 carp_attach_p = carp_attach;
2882 carp_detach_p = carp_detach;
2883 carp_demote_adj_p = carp_demote_adj;
2884 carp_master_p = carp_master;
2885 #ifdef INET6
2886 carp_iamatch6_p = carp_iamatch6;
2887 carp_macmatch6_p = carp_macmatch6;
2888 err = ip6proto_register(IPPROTO_CARP, carp6_input, NULL);
2889 if (err) {
2890 printf("carp: error %d registering with INET6\n", err);
2891 carp_mod_cleanup();
2892 return (err);
2893 }
2894 #endif
2895 #ifdef INET
2896 carp_iamatch_p = carp_iamatch;
2897 err = ipproto_register(IPPROTO_CARP, carp_input, NULL);
2898 if (err) {
2899 printf("carp: error %d registering with INET\n", err);
2900 carp_mod_cleanup();
2901 return (err);
2902 }
2903 #endif
2904
2905 carp_nl_register();
2906
2907 return (0);
2908 }
2909
2910 static int
carp_modevent(module_t mod,int type,void * data)2911 carp_modevent(module_t mod, int type, void *data)
2912 {
2913 switch (type) {
2914 case MOD_LOAD:
2915 return carp_mod_load();
2916 /* NOTREACHED */
2917 case MOD_UNLOAD:
2918 mtx_lock(&carp_mtx);
2919 if (LIST_EMPTY(&carp_list))
2920 carp_mod_cleanup();
2921 else {
2922 mtx_unlock(&carp_mtx);
2923 return (EBUSY);
2924 }
2925 break;
2926
2927 default:
2928 return (EINVAL);
2929 }
2930
2931 return (0);
2932 }
2933
/* Module description: name, event handler and (unused) extra data. */
static moduledata_t carp_mod = {
	"carp",
	carp_modevent,
	0
};

/* Declare the module at the same subsystem/order as the vnet sysinit. */
DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
2941