diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-08-22 14:59:50 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-09-21 10:29:41 +0200 |
commit | 3489e3b6396ee9944a6a2e19e675ca54c36993b4 (patch) | |
tree | cd55cfac1c96ff4b888a9606fd6a0d8eb65bb446 /freebsd/sys/netinet/in_pcb.c | |
parent | ck: Define CK_MD_PPC32_LWSYNC if available (diff) | |
download | rtems-libbsd-3489e3b6396ee9944a6a2e19e675ca54c36993b4.tar.bz2 |
Update to FreeBSD head 2018-09-17
Git mirror commit 6c2192b1ef8c50788c751f878552526800b1e319.
Update #3472.
Diffstat (limited to 'freebsd/sys/netinet/in_pcb.c')
-rw-r--r-- | freebsd/sys/netinet/in_pcb.c | 589 |
1 files changed, 459 insertions, 130 deletions
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index f89487b6..5ba918fa 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -114,6 +114,9 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> +#define INPCBLBGROUP_SIZMIN 8 +#define INPCBLBGROUP_SIZMAX 256 + static struct callout ipport_tick_callout; /* @@ -141,7 +144,7 @@ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); -static VNET_DEFINE(int, ipport_tcplastcount); +VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) @@ -223,6 +226,222 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, * functions often modify hash chains or addresses in pcbs. */ +static struct inpcblbgroup * +in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, + uint16_t port, const union in_dependaddr *addr, int size) +{ + struct inpcblbgroup *grp; + size_t bytes; + + bytes = __offsetof(struct inpcblbgroup, il_inp[size]); + grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); + if (!grp) + return (NULL); + grp->il_vflag = vflag; + grp->il_lport = port; + grp->il_dependladdr = *addr; + grp->il_inpsiz = size; + CK_LIST_INSERT_HEAD(hdr, grp, il_list); + return (grp); +} + +static void +in_pcblbgroup_free_deferred(epoch_context_t ctx) +{ + struct inpcblbgroup *grp; + + grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); + free(grp, M_PCB); +} + +static void +in_pcblbgroup_free(struct inpcblbgroup *grp) +{ + + CK_LIST_REMOVE(grp, il_list); + epoch_call(net_epoch_preempt, &grp->il_epoch_ctx, + in_pcblbgroup_free_deferred); +} + +static struct inpcblbgroup * +in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, + struct inpcblbgroup *old_grp, int size) +{ + struct inpcblbgroup *grp; + int i; + + grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, + old_grp->il_lport, &old_grp->il_dependladdr, size); + if (!grp) + return (NULL); + + KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, + ("invalid new local group size %d and old local group count %d", + grp->il_inpsiz, old_grp->il_inpcnt)); + + for (i = 0; i < old_grp->il_inpcnt; ++i) + grp->il_inp[i] = old_grp->il_inp[i]; + grp->il_inpcnt = old_grp->il_inpcnt; + in_pcblbgroup_free(old_grp); + return (grp); +} + +/* + * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] + * and shrink group if possible. + */ +static void +in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, + int i) +{ + struct inpcblbgroup *grp = *grpp; + + for (; i + 1 < grp->il_inpcnt; ++i) + grp->il_inp[i] = grp->il_inp[i + 1]; + grp->il_inpcnt--; + + if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && + grp->il_inpcnt <= (grp->il_inpsiz / 4)) { + /* Shrink this group. */ + struct inpcblbgroup *new_grp = + in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); + if (new_grp) + *grpp = new_grp; + } + return; +} + +/* + * Add PCB to load balance group for SO_REUSEPORT_LB option. + */ +static int +in_pcbinslbgrouphash(struct inpcb *inp) +{ + const static struct timeval interval = { 60, 0 }; + static struct timeval lastprint; + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + uint16_t hashmask, lport; + uint32_t group_index; + struct ucred *cred; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return (0); + + hashmask = pcbinfo->ipi_lbgrouphashmask; + lport = inp->inp_lport; + group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask); + hdr = &pcbinfo->ipi_lbgrouphashbase[group_index]; + + /* + * Don't allow jailed socket to join local group. + */ + if (inp->inp_socket != NULL) + cred = inp->inp_socket->so_cred; + else + cred = NULL; + if (cred != NULL && jailed(cred)) + return (0); + +#ifdef INET6 + /* + * Don't allow IPv4 mapped INET6 wild socket. + */ + if ((inp->inp_vflag & INP_IPV4) && + inp->inp_laddr.s_addr == INADDR_ANY && + INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { + return (0); + } +#endif + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + CK_LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL) { + /* Create new load balance group. */ + grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, + inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, + INPCBLBGROUP_SIZMIN); + if (!grp) + return (ENOBUFS); + } else if (grp->il_inpcnt == grp->il_inpsiz) { + if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { + if (ratecheck(&lastprint, &interval)) + printf("lb group port %d, limit reached\n", + ntohs(grp->il_lport)); + return (0); + } + + /* Expand this local group. */ + grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); + if (!grp) + return (ENOBUFS); + } + + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", + grp->il_inpsiz, grp->il_inpcnt)); + + grp->il_inp[grp->il_inpcnt] = inp; + grp->il_inpcnt++; + return (0); +} + +/* + * Remove PCB from load balance group. + */ +static void +in_pcbremlbgrouphash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + int i; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + + CK_LIST_FOREACH(grp, hdr, il_list) { + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_inpcnt == 1) { + /* We are the last, free this local group. */ + in_pcblbgroup_free(grp); + } else { + /* Pull up inpcbs, shrink group if possible. */ + in_pcblbgroup_reorder(hdr, &grp, i); + } + return; + } + } +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -252,12 +471,14 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; - LIST_INIT(pcbinfo->ipi_listhead); + CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif @@ -281,6 +502,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, + pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif @@ -341,7 +564,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); - LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); + CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 @@ -519,18 +742,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, /* * Return cached socket options. */ -short +int inp_so_options(const struct inpcb *inp) { - short so_options; + int so_options; - so_options = 0; + so_options = 0; - if ((inp->inp_flags2 & INP_REUSEPORT) != 0) - so_options |= SO_REUSEPORT; - if ((inp->inp_flags2 & INP_REUSEADDR) != 0) - so_options |= SO_REUSEADDR; - return (so_options); + if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) + so_options |= SO_REUSEPORT_LB; + if ((inp->inp_flags2 & INP_REUSEPORT) != 0) + so_options |= SO_REUSEPORT; + if ((inp->inp_flags2 & INP_REUSEADDR) != 0) + so_options |= SO_REUSEADDR; + return (so_options); } #endif /* INET || INET6 */ @@ -589,6 +814,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, int error; /* + * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here + * so that we don't have to add to the (already messy) code below. + */ + int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + + /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); @@ -599,7 +830,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) @@ -636,16 +867,23 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; + /* + * XXX: How to deal with SO_REUSEPORT_LB here? + * Treat same as SO_REUSEPORT for now. + */ + if ((so->so_options & + (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* - * Is the address a local IP address? + * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && - ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) + ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; @@ -675,7 +913,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_flags2 & INP_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) || + (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && #ifndef __rtems__ (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) @@ -704,11 +943,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ tw = intotw(t); if (tw == NULL || - (reuseport & tw->tw_so_options) == 0) + ((reuseport & tw->tw_so_options) == 0 && + (reuseport_lb & + tw->tw_so_options) == 0)) { return (EADDRINUSE); + } } else if (t && - ((inp->inp_flags2 & INP_BINDMULTI) == 0) && - (reuseport & inp_so_options(t)) == 0) { + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -717,7 +960,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif - return (EADDRINUSE); + return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } @@ -862,7 +1105,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; @@ -876,10 +1118,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); @@ -921,7 +1161,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, */ ia = NULL; ifp = sro.ro_rt->rt_ifp; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) @@ -934,10 +1173,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); @@ -985,9 +1222,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; @@ -1000,10 +1235,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ @@ -1347,6 +1580,58 @@ in_pcblist_rele_rlocked(epoch_context_t ctx) free(il, M_TEMP); } +static void +inpcbport_free(epoch_context_t ctx) +{ + struct inpcbport *phd; + + phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); + free(phd, M_PCB); +} + +static void +in_pcbfree_deferred(epoch_context_t ctx) +{ + struct inpcb *inp; + int released __unused; + + inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); + + INP_WLOCK(inp); +#ifdef INET + struct ip_moptions *imo = inp->inp_moptions; + inp->inp_moptions = NULL; +#endif + /* XXXRW: Do as much as possible here. */ +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif +#ifdef INET6 + struct ip6_moptions *im6o = NULL; + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); + inp->inp_vflag = 0; + crfree(inp->inp_cred); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif + released = in_pcbrele_wlocked(inp); + MPASS(released); +#ifdef INET6 + ip6_freemoptions(im6o); +#endif +#ifdef INET + inp_freemoptions(imo); +#endif +} + /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached @@ -1361,14 +1646,7 @@ in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; -#ifdef INET6 - struct ip6_moptions *im6o = NULL; -#endif -#ifdef INET - struct ip_moptions *imo = NULL; -#endif KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { @@ -1384,45 +1662,14 @@ in_pcbfree(struct inpcb *inp) } #endif INP_WLOCK_ASSERT(inp); - -#ifdef INET - imo = inp->inp_moptions; - inp->inp_moptions = NULL; -#endif - /* XXXRW: Do as much as possible here. */ -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif INP_LIST_WLOCK(pcbinfo); - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - im6o = inp->in6p_moptions; - inp->in6p_moptions = NULL; - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); RO_INVALIDATE_CACHE(&inp->inp_route); - - inp->inp_vflag = 0; + /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; - crfree(inp->inp_cred); -#ifdef MAC - mac_inpcb_destroy(inp); -#endif -#ifdef INET6 - ip6_freemoptions(im6o); -#endif -#ifdef INET - inp_freemoptions(imo); -#endif - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); + epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred); } /* @@ -1444,6 +1691,10 @@ in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); +#ifdef INVARIANTS + if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) + MPASS(inp->inp_refcount > 1); +#endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with @@ -1454,11 +1705,12 @@ in_pcbdrop(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); - LIST_REMOVE(inp, inp_hash); - LIST_REMOVE(inp, inp_portlist); - if (LIST_FIRST(&phd->phd_pcblist) == NULL) { - LIST_REMOVE(phd, phd_hash); - free(phd, M_PCB); + in_pcbremlbgrouphash(inp); + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; @@ -1532,7 +1784,7 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); - LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { + CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { @@ -1559,7 +1811,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) int i, gap; INP_INFO_WLOCK(pcbinfo); - LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && @@ -1624,7 +1876,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1658,7 +1910,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; - LIST_FOREACH(phd, porthash, phd_hash) { + CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } @@ -1667,7 +1919,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, * Port is in use by one or more PCBs. Look for best * fit. */ - LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { + CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, @@ -1717,6 +1969,50 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +static struct inpcb * +in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb *local_wild; + const struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + uint32_t idx; + + INP_HASH_LOCK_ASSERT(pcbinfo); + + hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(lport, + pcbinfo->ipi_lbgrouphashmask)]; + + /* + * Order of socket selection: + * 1. non-wild. + * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). + * + * NOTE: + * - Load balanced group does not contain jailed sockets + * - Load balanced group does not contain IPv4 mapped INET6 wild sockets + */ + local_wild = NULL; + CK_LIST_FOREACH(grp, hdr, il_list) { +#ifdef INET6 + if (!(grp->il_vflag & INP_IPV4)) + continue; +#endif + if (grp->il_lport != lport) + continue; + + idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % + grp->il_inpcnt; + if (grp->il_laddr.s_addr == laddr->s_addr) + return (grp->il_inp[idx]); + if (grp->il_laddr.s_addr == INADDR_ANY && + (lookupflags & INPLOOKUP_WILDCARD) != 0) + local_wild = grp->il_inp[idx]; + } + return (local_wild); +} + #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. @@ -1738,7 +2034,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; - LIST_FOREACH(inp, head, inp_pcbgrouphash) { + CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1788,7 +2084,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; - LIST_FOREACH(inp, head, inp_pcbgrouphash) { + CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1862,7 +2158,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; - LIST_FOREACH(inp, head, inp_pcbgroup_wild) { + CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1922,7 +2218,13 @@ found: locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); - if (!locked) + if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { + if (lookupflags & INPLOOKUP_WLOCKPCB) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + return (NULL); + } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { @@ -1960,18 +2262,19 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; +#ifdef INVARIANTS KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - - INP_HASH_LOCK_ASSERT(pcbinfo); - + if (!mtx_owned(&pcbinfo->ipi_hash_lock)) + MPASS(in_epoch_verbose(net_epoch_preempt, 1)); +#endif /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1996,6 +2299,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, return (tmpinp); /* + * Then look in lb group (for wildcard match). + */ + if (pcbinfo->ipi_lbgrouphashbase != NULL && + (lookupflags & INPLOOKUP_WILDCARD)) { + inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, + fport, lookupflags); + if (inp != NULL) { + return (inp); + } + } + + /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { @@ -2016,7 +2331,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -2080,40 +2395,35 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct ifnet *ifp) { struct inpcb *inp; - bool locked; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - locked = INP_TRY_WLOCK(inp); - else if (lookupflags & INPLOOKUP_RLOCKPCB) - locked = INP_TRY_RLOCK(inp); - else - panic("%s: locking bug", __func__); - if (!locked) - in_pcbref(inp); - INP_HASH_RUNLOCK(pcbinfo); - if (!locked) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (NULL); - } else { - INP_RLOCK(inp); - if (in_pcbrele_rlocked(inp)) - return (NULL); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(inp); + inp = NULL; } - } + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(inp); + inp = NULL; + } + } else + panic("%s: locking bug", __func__); #ifdef INVARIANTS - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); + if (inp != NULL) { + if (lookupflags & INPLOOKUP_WLOCKPCB) + INP_WLOCK_ASSERT(inp); + else + INP_RLOCK_ASSERT(inp); + } #endif - } else - INP_HASH_RUNLOCK(pcbinfo); + } + INP_HASH_RUNLOCK(pcbinfo); return (inp); } @@ -2212,6 +2522,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; + int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); @@ -2233,9 +2544,22 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* + * Add entry to load balance group. + * Only do this if SO_REUSEPORT_LB is set. + */ + so_options = inp_so_options(inp); + if (so_options & SO_REUSEPORT_LB) { + int ret = in_pcbinslbgrouphash(inp); + if (ret) { + /* pcb lb group malloc fail (ret=ENOBUFS). */ + return (ret); + } + } + + /* * Go through port list and look for a head for this lport. */ - LIST_FOREACH(phd, pcbporthash, phd_hash) { + CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } @@ -2247,13 +2571,14 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) if (phd == NULL) { return (ENOBUFS); /* XXX */ } + bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; - LIST_INIT(&phd->phd_pcblist); - LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + CK_LIST_INIT(&phd->phd_pcblist); + CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; - LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); - LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) @@ -2316,8 +2641,8 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; - LIST_REMOVE(inp, inp_hash); - LIST_INSERT_HEAD(head, inp, inp_hash); + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) @@ -2358,16 +2683,20 @@ in_pcbremlists(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); - LIST_REMOVE(inp, inp_hash); - LIST_REMOVE(inp, inp_portlist); - if (LIST_FIRST(&phd->phd_pcblist) == NULL) { - LIST_REMOVE(phd, phd_hash); - free(phd, M_PCB); + + /* XXX: Only do if SO_REUSEPORT_LB set? */ + in_pcbremlbgrouphash(inp); + + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } - LIST_REMOVE(inp, inp_list); + CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); @@ -2511,7 +2840,7 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); - LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); @@ -2594,7 +2923,7 @@ in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) bzero(&xi->xi_socket, sizeof(struct xsocket)); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; - xi->inp_ppcb = inp->inp_ppcb; + xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; |