summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/in_pcb.c
diff options
context:
space:
mode:
Diffstat (limited to 'freebsd/sys/netinet/in_pcb.c')
-rw-r--r--freebsd/sys/netinet/in_pcb.c589
1 files changed, 459 insertions, 130 deletions
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c
index f89487b6..5ba918fa 100644
--- a/freebsd/sys/netinet/in_pcb.c
+++ b/freebsd/sys/netinet/in_pcb.c
@@ -114,6 +114,9 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
+#define INPCBLBGROUP_SIZMIN 8
+#define INPCBLBGROUP_SIZMAX 256
+
static struct callout ipport_tick_callout;
/*
@@ -141,7 +144,7 @@ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);
-static VNET_DEFINE(int, ipport_tcplastcount);
+VNET_DEFINE_STATIC(int, ipport_tcplastcount);
#define V_ipport_tcplastcount VNET(ipport_tcplastcount)
@@ -223,6 +226,222 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
* functions often modify hash chains or addresses in pcbs.
*/
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+ uint16_t port, const union in_dependaddr *addr, int size)
+{
+ struct inpcblbgroup *grp;
+ size_t bytes;
+
+ bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+ grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
+ if (!grp)
+ return (NULL);
+ grp->il_vflag = vflag;
+ grp->il_lport = port;
+ grp->il_dependladdr = *addr;
+ grp->il_inpsiz = size;
+ CK_LIST_INSERT_HEAD(hdr, grp, il_list);
+ return (grp);
+}
+
+static void
+in_pcblbgroup_free_deferred(epoch_context_t ctx)
+{
+ struct inpcblbgroup *grp;
+
+ grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
+ free(grp, M_PCB);
+}
+
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+
+ CK_LIST_REMOVE(grp, il_list);
+ epoch_call(net_epoch_preempt, &grp->il_epoch_ctx,
+ in_pcblbgroup_free_deferred);
+}
+
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+ struct inpcblbgroup *old_grp, int size)
+{
+ struct inpcblbgroup *grp;
+ int i;
+
+ grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+ old_grp->il_lport, &old_grp->il_dependladdr, size);
+ if (!grp)
+ return (NULL);
+
+ KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid new local group size %d and old local group count %d",
+ grp->il_inpsiz, old_grp->il_inpcnt));
+
+ for (i = 0; i < old_grp->il_inpcnt; ++i)
+ grp->il_inp[i] = old_grp->il_inp[i];
+ grp->il_inpcnt = old_grp->il_inpcnt;
+ in_pcblbgroup_free(old_grp);
+ return (grp);
+}
+
+/*
+ * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
+ * and shrink group if possible.
+ */
+static void
+in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
+ int i)
+{
+ struct inpcblbgroup *grp = *grpp;
+
+ for (; i + 1 < grp->il_inpcnt; ++i)
+ grp->il_inp[i] = grp->il_inp[i + 1];
+ grp->il_inpcnt--;
+
+ if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+ grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
+ /* Shrink this group. */
+ struct inpcblbgroup *new_grp =
+ in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
+ if (new_grp)
+ *grpp = new_grp;
+ }
+ return;
+}
+
+/*
+ * Add PCB to load balance group for SO_REUSEPORT_LB option.
+ */
+static int
+in_pcbinslbgrouphash(struct inpcb *inp)
+{
+ const static struct timeval interval = { 60, 0 };
+ static struct timeval lastprint;
+ struct inpcbinfo *pcbinfo;
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ uint16_t hashmask, lport;
+ uint32_t group_index;
+ struct ucred *cred;
+
+ pcbinfo = inp->inp_pcbinfo;
+
+ INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return (0);
+
+ hashmask = pcbinfo->ipi_lbgrouphashmask;
+ lport = inp->inp_lport;
+ group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask);
+ hdr = &pcbinfo->ipi_lbgrouphashbase[group_index];
+
+ /*
+ * Don't allow jailed socket to join local group.
+ */
+ if (inp->inp_socket != NULL)
+ cred = inp->inp_socket->so_cred;
+ else
+ cred = NULL;
+ if (cred != NULL && jailed(cred))
+ return (0);
+
+#ifdef INET6
+ /*
+ * Don't allow IPv4 mapped INET6 wild socket.
+ */
+ if ((inp->inp_vflag & INP_IPV4) &&
+ inp->inp_laddr.s_addr == INADDR_ANY &&
+ INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
+ return (0);
+ }
+#endif
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+ pcbinfo->ipi_lbgrouphashmask)];
+ CK_LIST_FOREACH(grp, hdr, il_list) {
+ if (grp->il_vflag == inp->inp_vflag &&
+ grp->il_lport == inp->inp_lport &&
+ memcmp(&grp->il_dependladdr,
+ &inp->inp_inc.inc_ie.ie_dependladdr,
+ sizeof(grp->il_dependladdr)) == 0) {
+ break;
+ }
+ }
+ if (grp == NULL) {
+ /* Create new load balance group. */
+ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+ inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+ INPCBLBGROUP_SIZMIN);
+ if (!grp)
+ return (ENOBUFS);
+ } else if (grp->il_inpcnt == grp->il_inpsiz) {
+ if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+ if (ratecheck(&lastprint, &interval))
+ printf("lb group port %d, limit reached\n",
+ ntohs(grp->il_lport));
+ return (0);
+ }
+
+ /* Expand this local group. */
+ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+ if (!grp)
+ return (ENOBUFS);
+ }
+
+ KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid local group size %d and count %d",
+ grp->il_inpsiz, grp->il_inpcnt));
+
+ grp->il_inp[grp->il_inpcnt] = inp;
+ grp->il_inpcnt++;
+ return (0);
+}
+
+/*
+ * Remove PCB from load balance group.
+ */
+static void
+in_pcbremlbgrouphash(struct inpcb *inp)
+{
+ struct inpcbinfo *pcbinfo;
+ struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ int i;
+
+ pcbinfo = inp->inp_pcbinfo;
+
+ INP_WLOCK_ASSERT(inp);
+ INP_HASH_WLOCK_ASSERT(pcbinfo);
+
+ if (pcbinfo->ipi_lbgrouphashbase == NULL)
+ return;
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[
+ INP_PCBLBGROUP_PORTHASH(inp->inp_lport,
+ pcbinfo->ipi_lbgrouphashmask)];
+
+ CK_LIST_FOREACH(grp, hdr, il_list) {
+ for (i = 0; i < grp->il_inpcnt; ++i) {
+ if (grp->il_inp[i] != inp)
+ continue;
+
+ if (grp->il_inpcnt == 1) {
+ /* We are the last, free this local group. */
+ in_pcblbgroup_free(grp);
+ } else {
+ /* Pull up inpcbs, shrink group if possible. */
+ in_pcblbgroup_reorder(hdr, &grp, i);
+ }
+ return;
+ }
+ }
+}
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -252,12 +471,14 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
pcbinfo->ipi_vnet = curvnet;
#endif
pcbinfo->ipi_listhead = listhead;
- LIST_INIT(pcbinfo->ipi_listhead);
+ CK_LIST_INIT(pcbinfo->ipi_listhead);
pcbinfo->ipi_count = 0;
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
&pcbinfo->ipi_hashmask);
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
&pcbinfo->ipi_porthashmask);
+ pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+ &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
@@ -281,6 +502,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
pcbinfo->ipi_porthashmask);
+ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+ pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
in_pcbgroup_destroy(pcbinfo);
#endif
@@ -341,7 +564,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
#endif
INP_WLOCK(inp);
INP_LIST_WLOCK(pcbinfo);
- LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
+ CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
pcbinfo->ipi_count++;
so->so_pcb = (caddr_t)inp;
#ifdef INET6
@@ -519,18 +742,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
/*
* Return cached socket options.
*/
-short
+int
inp_so_options(const struct inpcb *inp)
{
- short so_options;
+ int so_options;
- so_options = 0;
+ so_options = 0;
- if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
- so_options |= SO_REUSEPORT;
- if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
- so_options |= SO_REUSEADDR;
- return (so_options);
+ if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+ so_options |= SO_REUSEPORT_LB;
+ if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+ so_options |= SO_REUSEPORT;
+ if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+ so_options |= SO_REUSEADDR;
+ return (so_options);
}
#endif /* INET || INET6 */
@@ -589,6 +814,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
int error;
/*
+ * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+ * so that we don't have to add to the (already messy) code below.
+ */
+ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+ /*
* No state changes, so read locks are sufficient here.
*/
INP_LOCK_ASSERT(inp);
@@ -599,7 +830,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
return (EINVAL);
- if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
lookupflags = INPLOOKUP_WILDCARD;
if (nam == NULL) {
if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -636,16 +867,23 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
*/
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
reuseport = SO_REUSEADDR|SO_REUSEPORT;
+ /*
+ * XXX: How to deal with SO_REUSEPORT_LB here?
+ * Treat same as SO_REUSEPORT for now.
+ */
+ if ((so->so_options &
+ (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+ reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
} else if (sin->sin_addr.s_addr != INADDR_ANY) {
sin->sin_port = 0; /* yech... */
bzero(&sin->sin_zero, sizeof(sin->sin_zero));
/*
- * Is the address a local IP address?
+ * Is the address a local IP address?
* If INP_BINDANY is set, then the socket may be bound
* to any endpoint address, local or not.
*/
if ((inp->inp_flags & INP_BINDANY) == 0 &&
- ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+ ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
return (EADDRNOTAVAIL);
}
laddr = sin->sin_addr;
@@ -675,7 +913,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
(ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
- (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+ (t->inp_flags2 & INP_REUSEPORT) ||
+ (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
#ifndef __rtems__
(inp->inp_cred->cr_uid !=
t->inp_cred->cr_uid))
@@ -704,11 +943,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
*/
tw = intotw(t);
if (tw == NULL ||
- (reuseport & tw->tw_so_options) == 0)
+ ((reuseport & tw->tw_so_options) == 0 &&
+ (reuseport_lb &
+ tw->tw_so_options) == 0)) {
return (EADDRINUSE);
+ }
} else if (t &&
- ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
- (reuseport & inp_so_options(t)) == 0) {
+ ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+ (reuseport & inp_so_options(t)) == 0 &&
+ (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
if (ntohl(sin->sin_addr.s_addr) !=
INADDR_ANY ||
@@ -717,7 +960,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
(inp->inp_vflag & INP_IPV6PROTO) == 0 ||
(t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
- return (EADDRINUSE);
+ return (EADDRINUSE);
if (t && (! in_pcbbind_check_bindmulti(inp, t)))
return (EADDRINUSE);
}
@@ -862,7 +1105,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
ifp = ia->ia_ifp;
ia = NULL;
- IF_ADDR_RLOCK(ifp);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
@@ -876,10 +1118,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
- IF_ADDR_RUNLOCK(ifp);
goto done;
}
- IF_ADDR_RUNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
@@ -921,7 +1161,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
*/
ia = NULL;
ifp = sro.ro_rt->rt_ifp;
- IF_ADDR_RLOCK(ifp);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
@@ -934,10 +1173,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
- IF_ADDR_RUNLOCK(ifp);
goto done;
}
- IF_ADDR_RUNLOCK(ifp);
/* 3. As a last resort return the 'default' jail address. */
error = prison_get_ip4(cred, laddr);
@@ -985,9 +1222,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
ifp = ia->ia_ifp;
ia = NULL;
- IF_ADDR_RLOCK(ifp);
CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
-
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
@@ -1000,10 +1235,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
}
if (ia != NULL) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
- IF_ADDR_RUNLOCK(ifp);
goto done;
}
- IF_ADDR_RUNLOCK(ifp);
}
/* 3. As a last resort return the 'default' jail address. */
@@ -1347,6 +1580,58 @@ in_pcblist_rele_rlocked(epoch_context_t ctx)
free(il, M_TEMP);
}
+static void
+inpcbport_free(epoch_context_t ctx)
+{
+ struct inpcbport *phd;
+
+ phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
+ free(phd, M_PCB);
+}
+
+static void
+in_pcbfree_deferred(epoch_context_t ctx)
+{
+ struct inpcb *inp;
+ int released __unused;
+
+ inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
+
+ INP_WLOCK(inp);
+#ifdef INET
+ struct ip_moptions *imo = inp->inp_moptions;
+ inp->inp_moptions = NULL;
+#endif
+ /* XXXRW: Do as much as possible here. */
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+ if (inp->inp_sp != NULL)
+ ipsec_delete_pcbpolicy(inp);
+#endif
+#ifdef INET6
+ struct ip6_moptions *im6o = NULL;
+ if (inp->inp_vflag & INP_IPV6PROTO) {
+ ip6_freepcbopts(inp->in6p_outputopts);
+ im6o = inp->in6p_moptions;
+ inp->in6p_moptions = NULL;
+ }
+#endif
+ if (inp->inp_options)
+ (void)m_free(inp->inp_options);
+ inp->inp_vflag = 0;
+ crfree(inp->inp_cred);
+#ifdef MAC
+ mac_inpcb_destroy(inp);
+#endif
+ released = in_pcbrele_wlocked(inp);
+ MPASS(released);
+#ifdef INET6
+ ip6_freemoptions(im6o);
+#endif
+#ifdef INET
+ inp_freemoptions(imo);
+#endif
+}
+
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
@@ -1361,14 +1646,7 @@ in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
-#ifdef INET6
- struct ip6_moptions *im6o = NULL;
-#endif
-#ifdef INET
- struct ip_moptions *imo = NULL;
-#endif
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
KASSERT((inp->inp_flags2 & INP_FREED) == 0,
("%s: called twice for pcb %p", __func__, inp));
if (inp->inp_flags2 & INP_FREED) {
@@ -1384,45 +1662,14 @@ in_pcbfree(struct inpcb *inp)
}
#endif
INP_WLOCK_ASSERT(inp);
-
-#ifdef INET
- imo = inp->inp_moptions;
- inp->inp_moptions = NULL;
-#endif
- /* XXXRW: Do as much as possible here. */
-#if defined(IPSEC) || defined(IPSEC_SUPPORT)
- if (inp->inp_sp != NULL)
- ipsec_delete_pcbpolicy(inp);
-#endif
INP_LIST_WLOCK(pcbinfo);
- inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
in_pcbremlists(inp);
INP_LIST_WUNLOCK(pcbinfo);
-#ifdef INET6
- if (inp->inp_vflag & INP_IPV6PROTO) {
- ip6_freepcbopts(inp->in6p_outputopts);
- im6o = inp->in6p_moptions;
- inp->in6p_moptions = NULL;
- }
-#endif
- if (inp->inp_options)
- (void)m_free(inp->inp_options);
RO_INVALIDATE_CACHE(&inp->inp_route);
-
- inp->inp_vflag = 0;
+ /* mark as destruction in progress */
inp->inp_flags2 |= INP_FREED;
- crfree(inp->inp_cred);
-#ifdef MAC
- mac_inpcb_destroy(inp);
-#endif
-#ifdef INET6
- ip6_freemoptions(im6o);
-#endif
-#ifdef INET
- inp_freemoptions(imo);
-#endif
- if (!in_pcbrele_wlocked(inp))
- INP_WUNLOCK(inp);
+ INP_WUNLOCK(inp);
+ epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred);
}
/*
@@ -1444,6 +1691,10 @@ in_pcbdrop(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
+#ifdef INVARIANTS
+ if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
+ MPASS(inp->inp_refcount > 1);
+#endif
/*
* XXXRW: Possibly we should protect the setting of INP_DROPPED with
@@ -1454,11 +1705,12 @@ in_pcbdrop(struct inpcb *inp)
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(inp->inp_pcbinfo);
- LIST_REMOVE(inp, inp_hash);
- LIST_REMOVE(inp, inp_portlist);
- if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
- LIST_REMOVE(phd, phd_hash);
- free(phd, M_PCB);
+ in_pcbremlbgrouphash(inp);
+ CK_LIST_REMOVE(inp, inp_hash);
+ CK_LIST_REMOVE(inp, inp_portlist);
+ if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ CK_LIST_REMOVE(phd, phd_hash);
+ epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
}
INP_HASH_WUNLOCK(inp->inp_pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
@@ -1532,7 +1784,7 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
struct inpcb *inp, *inp_temp;
INP_INFO_WLOCK(pcbinfo);
- LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
+ CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
INP_WLOCK(inp);
#ifdef INET6
if ((inp->inp_vflag & INP_IPV4) == 0) {
@@ -1559,7 +1811,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
int i, gap;
INP_INFO_WLOCK(pcbinfo);
- LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
INP_WLOCK(inp);
imo = inp->inp_moptions;
if ((inp->inp_vflag & INP_IPV4) &&
@@ -1624,7 +1876,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
*/
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
- LIST_FOREACH(inp, head, inp_hash) {
+ CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -1658,7 +1910,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
*/
porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
pcbinfo->ipi_porthashmask)];
- LIST_FOREACH(phd, porthash, phd_hash) {
+ CK_LIST_FOREACH(phd, porthash, phd_hash) {
if (phd->phd_port == lport)
break;
}
@@ -1667,7 +1919,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
* Port is in use by one or more PCBs. Look for best
* fit.
*/
- LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
+ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
wildcard = 0;
if (cred != NULL &&
!prison_equal_ip4(inp->inp_cred->cr_prison,
@@ -1717,6 +1969,50 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
}
#undef INP_LOOKUP_MAPPED_PCB_COST
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+ const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+ uint16_t fport, int lookupflags)
+{
+ struct inpcb *local_wild;
+ const struct inpcblbgrouphead *hdr;
+ struct inpcblbgroup *grp;
+ uint32_t idx;
+
+ INP_HASH_LOCK_ASSERT(pcbinfo);
+
+ hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(lport,
+ pcbinfo->ipi_lbgrouphashmask)];
+
+ /*
+ * Order of socket selection:
+ * 1. non-wild.
+ * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+ *
+ * NOTE:
+ * - Load balanced group does not contain jailed sockets
+ * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
+ */
+ local_wild = NULL;
+ CK_LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+ if (!(grp->il_vflag & INP_IPV4))
+ continue;
+#endif
+ if (grp->il_lport != lport)
+ continue;
+
+ idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
+ grp->il_inpcnt;
+ if (grp->il_laddr.s_addr == laddr->s_addr)
+ return (grp->il_inp[idx]);
+ if (grp->il_laddr.s_addr == INADDR_ANY &&
+ (lookupflags & INPLOOKUP_WILDCARD) != 0)
+ local_wild = grp->il_inp[idx];
+ }
+ return (local_wild);
+}
+
#ifdef PCBGROUP
/*
* Lookup PCB in hash list, using pcbgroup tables.
@@ -1738,7 +2034,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
INP_GROUP_LOCK(pcbgroup);
head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbgroup->ipg_hashmask)];
- LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+ CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -1788,7 +2084,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
lport, 0, pcbgroup->ipg_hashmask)];
- LIST_FOREACH(inp, head, inp_pcbgrouphash) {
+ CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -1862,7 +2158,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
*/
head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_wildmask)];
- LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
+ CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -1922,7 +2218,13 @@ found:
locked = INP_TRY_RLOCK(inp);
else
panic("%s: locking bug", __func__);
- if (!locked)
+ if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
+ if (lookupflags & INPLOOKUP_WLOCKPCB)
+ INP_WUNLOCK(inp);
+ else
+ INP_RUNLOCK(inp);
+ return (NULL);
+ } else if (!locked)
in_pcbref(inp);
INP_GROUP_UNLOCK(pcbgroup);
if (!locked) {
@@ -1960,18 +2262,19 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
struct inpcb *inp, *tmpinp;
u_short fport = fport_arg, lport = lport_arg;
+#ifdef INVARIANTS
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
("%s: invalid lookup flags %d", __func__, lookupflags));
-
- INP_HASH_LOCK_ASSERT(pcbinfo);
-
+ if (!mtx_owned(&pcbinfo->ipi_hash_lock))
+ MPASS(in_epoch_verbose(net_epoch_preempt, 1));
+#endif
/*
* First look for an exact match.
*/
tmpinp = NULL;
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
pcbinfo->ipi_hashmask)];
- LIST_FOREACH(inp, head, inp_hash) {
+ CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -1996,6 +2299,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
return (tmpinp);
/*
+ * Then look in lb group (for wildcard match).
+ */
+ if (pcbinfo->ipi_lbgrouphashbase != NULL &&
+ (lookupflags & INPLOOKUP_WILDCARD)) {
+ inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
+ fport, lookupflags);
+ if (inp != NULL) {
+ return (inp);
+ }
+ }
+
+ /*
* Then look for a wildcard match, if requested.
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2016,7 +2331,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
0, pcbinfo->ipi_hashmask)];
- LIST_FOREACH(inp, head, inp_hash) {
+ CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
/* XXX inp locking */
if ((inp->inp_vflag & INP_IPV4) == 0)
@@ -2080,40 +2395,35 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
struct ifnet *ifp)
{
struct inpcb *inp;
- bool locked;
INP_HASH_RLOCK(pcbinfo);
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
(lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
if (inp != NULL) {
- if (lookupflags & INPLOOKUP_WLOCKPCB)
- locked = INP_TRY_WLOCK(inp);
- else if (lookupflags & INPLOOKUP_RLOCKPCB)
- locked = INP_TRY_RLOCK(inp);
- else
- panic("%s: locking bug", __func__);
- if (!locked)
- in_pcbref(inp);
- INP_HASH_RUNLOCK(pcbinfo);
- if (!locked) {
- if (lookupflags & INPLOOKUP_WLOCKPCB) {
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp))
- return (NULL);
- } else {
- INP_RLOCK(inp);
- if (in_pcbrele_rlocked(inp))
- return (NULL);
+ if (lookupflags & INPLOOKUP_WLOCKPCB) {
+ INP_WLOCK(inp);
+ if (__predict_false(inp->inp_flags2 & INP_FREED)) {
+ INP_WUNLOCK(inp);
+ inp = NULL;
}
- }
+ } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
+ INP_RLOCK(inp);
+ if (__predict_false(inp->inp_flags2 & INP_FREED)) {
+ INP_RUNLOCK(inp);
+ inp = NULL;
+ }
+ } else
+ panic("%s: locking bug", __func__);
#ifdef INVARIANTS
- if (lookupflags & INPLOOKUP_WLOCKPCB)
- INP_WLOCK_ASSERT(inp);
- else
- INP_RLOCK_ASSERT(inp);
+ if (inp != NULL) {
+ if (lookupflags & INPLOOKUP_WLOCKPCB)
+ INP_WLOCK_ASSERT(inp);
+ else
+ INP_RLOCK_ASSERT(inp);
+ }
#endif
- } else
- INP_HASH_RUNLOCK(pcbinfo);
+ }
+ INP_HASH_RUNLOCK(pcbinfo);
return (inp);
}
@@ -2212,6 +2522,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
struct inpcbport *phd;
u_int32_t hashkey_faddr;
+ int so_options;
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2233,9 +2544,22 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
/*
+ * Add entry to load balance group.
+ * Only do this if SO_REUSEPORT_LB is set.
+ */
+ so_options = inp_so_options(inp);
+ if (so_options & SO_REUSEPORT_LB) {
+ int ret = in_pcbinslbgrouphash(inp);
+ if (ret) {
+ /* pcb lb group malloc fail (ret=ENOBUFS). */
+ return (ret);
+ }
+ }
+
+ /*
* Go through port list and look for a head for this lport.
*/
- LIST_FOREACH(phd, pcbporthash, phd_hash) {
+ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
if (phd->phd_port == inp->inp_lport)
break;
}
@@ -2247,13 +2571,14 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update)
if (phd == NULL) {
return (ENOBUFS); /* XXX */
}
+ bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
phd->phd_port = inp->inp_lport;
- LIST_INIT(&phd->phd_pcblist);
- LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
+ CK_LIST_INIT(&phd->phd_pcblist);
+ CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
}
inp->inp_phd = phd;
- LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
- LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
+ CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
+ CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
inp->inp_flags |= INP_INHASHLIST;
#ifdef PCBGROUP
if (do_pcbgroup_update)
@@ -2316,8 +2641,8 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
- LIST_REMOVE(inp, inp_hash);
- LIST_INSERT_HEAD(head, inp, inp_hash);
+ CK_LIST_REMOVE(inp, inp_hash);
+ CK_LIST_INSERT_HEAD(head, inp, inp_hash);
#ifdef PCBGROUP
if (m != NULL)
@@ -2358,16 +2683,20 @@ in_pcbremlists(struct inpcb *inp)
struct inpcbport *phd = inp->inp_phd;
INP_HASH_WLOCK(pcbinfo);
- LIST_REMOVE(inp, inp_hash);
- LIST_REMOVE(inp, inp_portlist);
- if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
- LIST_REMOVE(phd, phd_hash);
- free(phd, M_PCB);
+
+ /* XXX: Only do if SO_REUSEPORT_LB set? */
+ in_pcbremlbgrouphash(inp);
+
+ CK_LIST_REMOVE(inp, inp_hash);
+ CK_LIST_REMOVE(inp, inp_portlist);
+ if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
+ CK_LIST_REMOVE(phd, phd_hash);
+ epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
}
INP_HASH_WUNLOCK(pcbinfo);
inp->inp_flags &= ~INP_INHASHLIST;
}
- LIST_REMOVE(inp, inp_list);
+ CK_LIST_REMOVE(inp, inp_list);
pcbinfo->ipi_count--;
#ifdef PCBGROUP
in_pcbgroup_remove(inp);
@@ -2511,7 +2840,7 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
struct inpcb *inp;
INP_INFO_WLOCK(&V_tcbinfo);
- LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
INP_WLOCK(inp);
func(inp, arg);
INP_WUNLOCK(inp);
@@ -2594,7 +2923,7 @@ in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
bzero(&xi->xi_socket, sizeof(struct xsocket));
bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
xi->inp_gencnt = inp->inp_gencnt;
- xi->inp_ppcb = inp->inp_ppcb;
+ xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
xi->inp_flow = inp->inp_flow;
xi->inp_flowid = inp->inp_flowid;
xi->inp_flowtype = inp->inp_flowtype;