summary | refs | log | tree | commit | diff | stats
path: root/freebsd/sys/netinet
diff options
context:
space:
mode:
author: Sebastian Huber <sebastian.huber@embedded-brains.de> 2018-08-21 13:47:02 +0200
committer: Sebastian Huber <sebastian.huber@embedded-brains.de> 2018-09-21 10:29:41 +0200
commit: bcdce02d9bc8150e1d191ed5ca9da45b7604964a (patch)
tree: 3b2faf509db7672ee1fc98857736470be97e7ed8 /freebsd/sys/netinet
parent: Update to FreeBSD head 2018-04-01 (diff)
download: rtems-libbsd-bcdce02d9bc8150e1d191ed5ca9da45b7604964a.tar.bz2
Update to FreeBSD head 2018-06-01
Git mirror commit fb63610a69b0eb7f69a201ba05c4c1a7a2739cf9. Update #3472.
Diffstat (limited to 'freebsd/sys/netinet')
-rw-r--r-- freebsd/sys/netinet/cc/cc_newreno.c 66
-rw-r--r-- freebsd/sys/netinet/if_ether.c 16
-rw-r--r-- freebsd/sys/netinet/igmp.c 201
-rw-r--r-- freebsd/sys/netinet/igmp_var.h 1
-rw-r--r-- freebsd/sys/netinet/in.c 179
-rw-r--r-- freebsd/sys/netinet/in_mcast.c 326
-rw-r--r-- freebsd/sys/netinet/in_pcb.c 135
-rw-r--r-- freebsd/sys/netinet/in_pcb.h 89
-rw-r--r-- freebsd/sys/netinet/in_proto.c 5
-rw-r--r-- freebsd/sys/netinet/in_var.h 62
-rw-r--r-- freebsd/sys/netinet/ip_carp.c 72
-rw-r--r-- freebsd/sys/netinet/ip_divert.c 27
-rw-r--r-- freebsd/sys/netinet/ip_encap.c 9
-rw-r--r-- freebsd/sys/netinet/ip_encap.h 1
-rw-r--r-- freebsd/sys/netinet/ip_icmp.c 18
-rw-r--r-- freebsd/sys/netinet/ip_input.c 32
-rw-r--r-- freebsd/sys/netinet/ip_mroute.c 6
-rw-r--r-- freebsd/sys/netinet/ip_options.c 7
-rw-r--r-- freebsd/sys/netinet/ip_output.c 14
-rw-r--r-- freebsd/sys/netinet/ip_var.h 4
-rw-r--r-- freebsd/sys/netinet/netdump/netdump.h 132
-rw-r--r-- freebsd/sys/netinet/raw_ip.c 27
-rw-r--r-- freebsd/sys/netinet/sctp_bsd_addr.c 13
-rw-r--r-- freebsd/sys/netinet/sctp_indata.c 8
-rw-r--r-- freebsd/sys/netinet/sctp_input.c 2
-rw-r--r-- freebsd/sys/netinet/sctp_os_bsd.h 1
-rw-r--r-- freebsd/sys/netinet/sctp_output.c 9
-rw-r--r-- freebsd/sys/netinet/sctp_usrreq.c 20
-rw-r--r-- freebsd/sys/netinet/sctputil.c 82
-rw-r--r-- freebsd/sys/netinet/tcp_hpts.h 304
-rw-r--r-- freebsd/sys/netinet/tcp_input.c 14
-rw-r--r-- freebsd/sys/netinet/tcp_offload.c 11
-rw-r--r-- freebsd/sys/netinet/tcp_offload.h 1
-rw-r--r-- freebsd/sys/netinet/tcp_output.c 17
-rw-r--r-- freebsd/sys/netinet/tcp_seq.h 8
-rw-r--r-- freebsd/sys/netinet/tcp_subr.c 339
-rw-r--r-- freebsd/sys/netinet/tcp_syncache.c 6
-rw-r--r-- freebsd/sys/netinet/tcp_timer.c 10
-rw-r--r-- freebsd/sys/netinet/tcp_timewait.c 2
-rw-r--r-- freebsd/sys/netinet/tcp_usrreq.c 41
-rw-r--r-- freebsd/sys/netinet/tcp_var.h 144
-rw-r--r-- freebsd/sys/netinet/toecore.h 5
-rw-r--r-- freebsd/sys/netinet/udp_usrreq.c 23
43 files changed, 1779 insertions, 710 deletions
diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c
index b7f59520..4d5f8644 100644
--- a/freebsd/sys/netinet/cc/cc_newreno.c
+++ b/freebsd/sys/netinet/cc/cc_newreno.c
@@ -83,7 +83,7 @@ static MALLOC_DEFINE(M_NEWRENO, "newreno data",
#define CAST_PTR_INT(X) (*((int*)(X)))
-static int newreno_cb_init(struct cc_var *ccv);
+static void newreno_cb_destroy(struct cc_var *ccv);
static void newreno_ack_received(struct cc_var *ccv, uint16_t type);
static void newreno_after_idle(struct cc_var *ccv);
static void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
@@ -97,7 +97,7 @@ static VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
struct cc_algo newreno_cc_algo = {
.name = "newreno",
- .cb_init = newreno_cb_init,
+ .cb_destroy = newreno_cb_destroy,
.ack_received = newreno_ack_received,
.after_idle = newreno_after_idle,
.cong_signal = newreno_cong_signal,
@@ -110,18 +110,28 @@ struct newreno {
uint32_t beta_ecn;
};
-int
-newreno_cb_init(struct cc_var *ccv)
+static inline struct newreno *
+newreno_malloc(struct cc_var *ccv)
{
- struct newreno *nreno;
+ struct newreno *nreno;
- nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT|M_ZERO);
+ nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT);
if (nreno != NULL) {
+ /* NB: nreno is not zeroed, so initialise all fields. */
nreno->beta = V_newreno_beta;
nreno->beta_ecn = V_newreno_beta_ecn;
+ ccv->cc_data = nreno;
}
- return (0);
+ return (nreno);
+}
+
+static void
+newreno_cb_destroy(struct cc_var *ccv)
+{
+
+ if (ccv->cc_data != NULL)
+ free(ccv->cc_data, M_NEWRENO);
}
static void
@@ -226,20 +236,18 @@ static void
newreno_cong_signal(struct cc_var *ccv, uint32_t type)
{
struct newreno *nreno;
- uint32_t cwin, factor;
+ uint32_t beta, beta_ecn, cwin, factor;
u_int mss;
- factor = V_newreno_beta;
- nreno = ccv->cc_data;
- if (nreno != NULL) {
- if (V_cc_do_abe)
- factor = (type == CC_ECN ? nreno->beta_ecn: nreno->beta);
- else
- factor = nreno->beta;
- }
-
cwin = CCV(ccv, snd_cwnd);
mss = CCV(ccv, t_maxseg);
+ nreno = ccv->cc_data;
+ beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;
+ beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
+ if (V_cc_do_abe && type == CC_ECN)
+ factor = beta_ecn;
+ else
+ factor = beta;
/* Catch algos which mistakenly leak private signal types. */
KASSERT((type & CC_SIGPRIVMASK) == 0,
@@ -255,8 +263,8 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
V_cc_do_abe && V_cc_abe_frlossreduce)) {
CCV(ccv, snd_ssthresh) =
((uint64_t)CCV(ccv, snd_ssthresh) *
- (uint64_t)nreno->beta) /
- (100ULL * (uint64_t)nreno->beta_ecn);
+ (uint64_t)beta) /
+ (100ULL * (uint64_t)beta_ecn);
}
if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
CCV(ccv, snd_ssthresh) = cwin;
@@ -280,7 +288,6 @@ static void
newreno_post_recovery(struct cc_var *ccv)
{
int pipe;
- pipe = 0;
if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
/*
@@ -304,7 +311,7 @@ newreno_post_recovery(struct cc_var *ccv)
}
}
-int
+static int
newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
{
struct newreno *nreno;
@@ -315,9 +322,15 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
nreno = ccv->cc_data;
opt = buf;
-
+
switch (sopt->sopt_dir) {
case SOPT_SET:
+ /* We cannot set without cc_data memory. */
+ if (nreno == NULL) {
+ nreno = newreno_malloc(ccv);
+ if (nreno == NULL)
+ return (ENOMEM);
+ }
switch (opt->name) {
case CC_NEWRENO_BETA:
nreno->beta = opt->val;
@@ -330,17 +343,21 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
default:
return (ENOPROTOOPT);
}
+ break;
case SOPT_GET:
switch (opt->name) {
case CC_NEWRENO_BETA:
- opt->val = nreno->beta;
+ opt->val = (nreno == NULL) ?
+ V_newreno_beta : nreno->beta;
break;
case CC_NEWRENO_BETA_ECN:
- opt->val = nreno->beta_ecn;
+ opt->val = (nreno == NULL) ?
+ V_newreno_beta_ecn : nreno->beta_ecn;
break;
default:
return (ENOPROTOOPT);
}
+ break;
default:
return (EINVAL);
}
@@ -351,6 +368,7 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
static int
newreno_beta_handler(SYSCTL_HANDLER_ARGS)
{
+
if (req->newptr != NULL ) {
if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe)
return (EACCES);
diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c
index 699af2e4..0d608180 100644
--- a/freebsd/sys/netinet/if_ether.c
+++ b/freebsd/sys/netinet/if_ether.c
@@ -364,7 +364,7 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip,
struct ifaddr *ifa;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
@@ -696,14 +696,6 @@ arpintr(struct mbuf *m)
hlen = ETHER_ADDR_LEN; /* RFC 826 */
layer = "ethernet";
break;
- case ARPHRD_IEEE802:
- hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */
- layer = "fddi";
- break;
- case ARPHRD_ARCNET:
- hlen = 1; /* RFC 1201, ARC_ADDR_LEN */
- layer = "arcnet";
- break;
case ARPHRD_INFINIBAND:
hlen = 20; /* RFC 4391, INFINIBAND_ALEN */
layer = "infiniband";
@@ -896,7 +888,7 @@ in_arpinput(struct mbuf *m)
* as a dummy address for the rest of the function.
*/
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
(ifa->ifa_carp == NULL ||
(*carp_iamatch_p)(ifa, &enaddr))) {
@@ -911,7 +903,7 @@ in_arpinput(struct mbuf *m)
* If bridging, fall back to using any inet address.
*/
IN_IFADDR_RLOCK(&in_ifa_tracker);
- if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
+ if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
goto drop;
}
@@ -1455,7 +1447,7 @@ arp_handle_ifllchange(struct ifnet *ifp)
{
struct ifaddr *ifa;
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == AF_INET)
arp_ifinit(ifp, ifa);
}
diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c
index cf319470..a4b99f62 100644
--- a/freebsd/sys/netinet/igmp.c
+++ b/freebsd/sys/netinet/igmp.c
@@ -138,7 +138,7 @@ static int igmp_v3_enqueue_group_record(struct mbufq *,
struct in_multi *, const int, const int, const int);
static int igmp_v3_enqueue_filter_change(struct mbufq *,
struct in_multi *);
-static void igmp_v3_process_group_timers(struct igmp_ifsoftc *,
+static void igmp_v3_process_group_timers(struct in_multi_head *,
struct mbufq *, struct mbufq *, struct in_multi *,
const int);
static int igmp_v3_merge_state_changes(struct in_multi *,
@@ -164,12 +164,12 @@ static const struct netisr_handler igmp_nh = {
* themselves are not virtualized.
*
* Locking:
- * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
+ * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
* Any may be taken independently; if any are held at the same
* time, the above lock order must be followed.
* * All output is delegated to the netisr.
* Now that Giant has been eliminated, the netisr may be inlined.
- * * IN_MULTI_LOCK covers in_multi.
+ * * IN_MULTI_LIST_LOCK covers in_multi.
* * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file,
* including the output queue.
* * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
@@ -443,7 +443,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
if (error)
return (error);
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
IGMP_LOCK();
if (name[0] <= 0 || name[0] > V_if_index) {
@@ -477,7 +477,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
out_locked:
IGMP_UNLOCK();
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (error);
}
@@ -588,7 +588,6 @@ igi_alloc_locked(/*const*/ struct ifnet *ifp)
igi->igi_qi = IGMP_QI_INIT;
igi->igi_qri = IGMP_QRI_INIT;
igi->igi_uri = IGMP_URI_INIT;
- SLIST_INIT(&igi->igi_relinmhead);
mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
@@ -613,44 +612,37 @@ void
igmp_ifdetach(struct ifnet *ifp)
{
struct igmp_ifsoftc *igi;
- struct ifmultiaddr *ifma;
- struct in_multi *inm, *tinm;
-
+ struct ifmultiaddr *ifma, *next;
+ struct in_multi *inm;
+ struct in_multi_head inm_free_tmp;
CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
ifp->if_xname);
+ SLIST_INIT(&inm_free_tmp);
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
if (igi->igi_version == IGMP_VERSION_3) {
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ IF_ADDR_WLOCK(ifp);
+ restart:
+ CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
-#if 0
- KASSERT(ifma->ifma_protospec != NULL,
- ("%s: ifma_protospec is NULL", __func__));
-#endif
inm = (struct in_multi *)ifma->ifma_protospec;
- if (inm->inm_state == IGMP_LEAVING_MEMBER) {
- SLIST_INSERT_HEAD(&igi->igi_relinmhead,
- inm, inm_nrele);
- }
+ if (inm->inm_state == IGMP_LEAVING_MEMBER)
+ inm_rele_locked(&inm_free_tmp, inm);
inm_clear_recorded(inm);
+ if (__predict_false(ifma_restart)) {
+ ifma_restart = false;
+ goto restart;
+ }
}
- IF_ADDR_RUNLOCK(ifp);
- /*
- * Free the in_multi reference(s) for this IGMP lifecycle.
- */
- SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
- tinm) {
- SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
- inm_release_locked(inm);
- }
+ IF_ADDR_WUNLOCK(ifp);
+ inm_release_list_deferred(&inm_free_tmp);
}
-
IGMP_UNLOCK();
+
}
/*
@@ -686,11 +678,6 @@ igi_delete_locked(const struct ifnet *ifp)
mbufq_drain(&igi->igi_gq);
LIST_REMOVE(igi, igi_link);
-
- KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
- ("%s: there are dangling in_multi references",
- __func__));
-
free(igi, M_IGMP);
return;
}
@@ -724,7 +711,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
}
IGMPSTAT_INC(igps_rcv_gen_queries);
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
@@ -749,7 +736,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
* except those which are already running.
*/
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -780,7 +767,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
out_locked:
IGMP_UNLOCK();
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -818,7 +805,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
IGMPSTAT_INC(igps_rcv_group_queries);
}
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
@@ -850,7 +837,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
ifp, ifp->if_xname);
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -874,7 +861,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
out_locked:
IGMP_UNLOCK();
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -901,7 +888,7 @@ igmp_v2_update_group(struct in_multi *inm, const int timer)
CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__,
ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer);
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
switch (inm->inm_state) {
case IGMP_NOT_MEMBER:
@@ -1013,7 +1000,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
IGMPSTAT_INC(igps_rcv_gsr_queries);
}
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
IGMP_LOCK();
igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
@@ -1094,7 +1081,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
out_locked:
IGMP_UNLOCK();
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -1111,7 +1098,7 @@ igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi,
int retval;
uint16_t nsrc;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
retval = 0;
@@ -1233,11 +1220,11 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* Replace 0.0.0.0 with the subnet address if told to do so.
*/
if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
+ NET_EPOCH_ENTER();
IFP_TO_IA(ifp, ia, &in_ifa_tracker);
- if (ia != NULL) {
+ if (ia != NULL)
ip->ip_src.s_addr = htonl(ia->ia_subnet);
- ifa_free(&ia->ia_ifa);
- }
+ NET_EPOCH_EXIT();
}
CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)",
@@ -1248,7 +1235,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* If we are a member of this group, and our membership should be
* reported, stop our group timer and transition to the 'lazy' state.
*/
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
inm = inm_lookup(ifp, igmp->igmp_group);
if (inm != NULL) {
struct igmp_ifsoftc *igi;
@@ -1307,7 +1294,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
}
out_locked:
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -1330,24 +1317,23 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* leave requires knowing that we are the only member of a
* group.
*/
+ NET_EPOCH_ENTER();
IFP_TO_IA(ifp, ia, &in_ifa_tracker);
if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
- ifa_free(&ia->ia_ifa);
+ NET_EPOCH_EXIT();
return (0);
}
IGMPSTAT_INC(igps_rcv_reports);
if (ifp->if_flags & IFF_LOOPBACK) {
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
+ NET_EPOCH_EXIT();
return (0);
}
if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
!in_hosteq(igmp->igmp_group, ip->ip_dst)) {
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
+ NET_EPOCH_EXIT();
IGMPSTAT_INC(igps_rcv_badreports);
return (EINVAL);
}
@@ -1363,8 +1349,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
if (ia != NULL)
ip->ip_src.s_addr = htonl(ia->ia_subnet);
}
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
+ NET_EPOCH_EXIT();
CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)",
ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname);
@@ -1375,7 +1360,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
* reported, and our group timer is pending or about to be reset,
* stop our group timer by transitioning to the 'lazy' state.
*/
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
inm = inm_lookup(ifp, igmp->igmp_group);
if (inm != NULL) {
struct igmp_ifsoftc *igi;
@@ -1420,7 +1405,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
}
out_locked:
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -1647,8 +1632,9 @@ igmp_fasttimo_vnet(void)
struct mbufq qrq; /* Query response packets */
struct ifnet *ifp;
struct igmp_ifsoftc *igi;
- struct ifmultiaddr *ifma;
+ struct ifmultiaddr *ifma, *next;
struct in_multi *inm;
+ struct in_multi_head inm_free_tmp;
int loop, uri_fasthz;
loop = 0;
@@ -1664,7 +1650,8 @@ igmp_fasttimo_vnet(void)
!V_state_change_timers_running)
return;
- IN_MULTI_LOCK();
+ SLIST_INIT(&inm_free_tmp);
+ IN_MULTI_LIST_LOCK();
IGMP_LOCK();
/*
@@ -1709,8 +1696,9 @@ igmp_fasttimo_vnet(void)
mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
}
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ IF_ADDR_WLOCK(ifp);
+ restart:
+ CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -1722,16 +1710,18 @@ igmp_fasttimo_vnet(void)
igi->igi_version);
break;
case IGMP_VERSION_3:
- igmp_v3_process_group_timers(igi, &qrq,
+ igmp_v3_process_group_timers(&inm_free_tmp, &qrq,
&scq, inm, uri_fasthz);
break;
}
+ if (__predict_false(ifma_restart)) {
+ ifma_restart = false;
+ goto restart;
+ }
}
- IF_ADDR_RUNLOCK(ifp);
+ IF_ADDR_WUNLOCK(ifp);
if (igi->igi_version == IGMP_VERSION_3) {
- struct in_multi *tinm;
-
igmp_dispatch_queue(&qrq, 0, loop);
igmp_dispatch_queue(&scq, 0, loop);
@@ -1739,18 +1729,13 @@ igmp_fasttimo_vnet(void)
* Free the in_multi reference(s) for this
* IGMP lifecycle.
*/
- SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
- inm_nrele, tinm) {
- SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
- inm_nrele);
- inm_release_locked(inm);
- }
+ inm_release_list_deferred(&inm_free_tmp);
}
}
out_locked:
IGMP_UNLOCK();
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
}
/*
@@ -1762,7 +1747,7 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
{
int report_timer_expired;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
if (inm->inm_timer == 0) {
@@ -1804,14 +1789,14 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
* Note: Unlocked read from igi.
*/
static void
-igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
+igmp_v3_process_group_timers(struct in_multi_head *inmh,
struct mbufq *qrq, struct mbufq *scq,
struct in_multi *inm, const int uri_fasthz)
{
int query_response_timer_expired;
int state_change_retransmit_timer_expired;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
query_response_timer_expired = 0;
@@ -1861,7 +1846,7 @@ igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
* immediate transmission.
*/
if (query_response_timer_expired) {
- int retval;
+ int retval __unused;
retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
(inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
@@ -1909,8 +1894,7 @@ igmp_v3_process_group_timers(struct igmp_ifsoftc *igi,
if (inm->inm_state == IGMP_LEAVING_MEMBER &&
inm->inm_scrv == 0) {
inm->inm_state = IGMP_NOT_MEMBER;
- SLIST_INSERT_HEAD(&igi->igi_relinmhead,
- inm, inm_nrele);
+ inm_rele_locked(inmh, inm);
}
}
break;
@@ -1931,7 +1915,7 @@ static void
igmp_v3_suppress_group_record(struct in_multi *inm)
{
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
("%s: not IGMPv3 mode on link", __func__));
@@ -2005,13 +1989,15 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
{
struct ifmultiaddr *ifma;
struct ifnet *ifp;
- struct in_multi *inm, *tinm;
+ struct in_multi *inm;
+ struct in_multi_head inm_free_tmp;
CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
igi->igi_ifp, igi->igi_ifp->if_xname);
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
+ SLIST_INIT(&inm_free_tmp);
/*
* Stop the v3 General Query Response on this link stone dead.
@@ -2026,7 +2012,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
*/
ifp = igi->igi_ifp;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -2052,7 +2038,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
* message is sent upstream to the old querier --
* transition to NOT would lose the leave and race.
*/
- SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele);
+ inm_rele_locked(&inm_free_tmp, inm);
/* FALLTHROUGH */
case IGMP_G_QUERY_PENDING_MEMBER:
case IGMP_SG_QUERY_PENDING_MEMBER:
@@ -2071,10 +2057,8 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
mbufq_drain(&inm->inm_scq);
}
IF_ADDR_RUNLOCK(ifp);
- SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) {
- SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
- inm_release_locked(inm);
- }
+
+ inm_release_list_deferred(&inm_free_tmp);
}
/*
@@ -2201,7 +2185,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type)
struct ip *ip;
struct mbuf *m;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
ifp = inm->inm_ifp;
@@ -2278,10 +2262,8 @@ igmp_change_state(struct in_multi *inm)
struct ifnet *ifp;
int error;
- IN_MULTI_LOCK_ASSERT();
-
error = 0;
-
+ IN_MULTI_LOCK_ASSERT();
/*
* Try to detect if the upper layer just asked us to change state
* for an interface which has now gone away.
@@ -2381,9 +2363,10 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi)
* group around for the final INCLUDE {} enqueue.
*/
if (igi->igi_version == IGMP_VERSION_3 &&
- inm->inm_state == IGMP_LEAVING_MEMBER)
- inm_release_locked(inm);
-
+ inm->inm_state == IGMP_LEAVING_MEMBER) {
+ MPASS(inm->inm_refcount > 1);
+ inm_rele_locked(NULL, inm);
+ }
inm->inm_state = IGMP_REPORTING_MEMBER;
switch (igi->igi_version) {
@@ -2475,7 +2458,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi)
ifp = inm->inm_ifp;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
@@ -2533,7 +2516,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
__func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp,
inm->inm_ifp->if_xname);
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
switch (inm->inm_state) {
@@ -2579,7 +2562,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
inm->inm_state = IGMP_NOT_MEMBER;
inm->inm_sctimer = 0;
} else {
- int retval;
+ int retval __unused;
inm_acquire_locked(inm);
@@ -2652,7 +2635,7 @@ igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
struct ifnet *ifp;
struct ip_msource *ims, *nims;
struct mbuf *m0, *m, *md;
- int error, is_filter_list_change;
+ int is_filter_list_change;
int minrec0len, m0srcs, msrcs, nbytes, off;
int record_has_sources;
int now;
@@ -2660,9 +2643,8 @@ igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
in_addr_t naddr;
uint8_t mode;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
- error = 0;
ifp = inm->inm_ifp;
is_filter_list_change = 0;
m = NULL;
@@ -3020,7 +3002,7 @@ igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm)
uint8_t mode, now, then;
rectype_t crt, drt, nrt;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
if (inm->inm_nsrc == 0 ||
(inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
@@ -3223,7 +3205,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq)
domerge = 0;
recslen = 0;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
/*
@@ -3320,9 +3302,9 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
struct ifmultiaddr *ifma;
struct ifnet *ifp;
struct in_multi *inm;
- int retval, loop;
+ int retval __unused, loop;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IGMP_LOCK_ASSERT();
KASSERT(igi->igi_version == IGMP_VERSION_3,
@@ -3340,7 +3322,7 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
ifp = igi->igi_ifp;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -3544,11 +3526,11 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
if (m->m_flags & M_IGMP_LOOP) {
struct in_ifaddr *ia;
+ NET_EPOCH_ENTER();
IFP_TO_IA(ifp, ia, &in_ifa_tracker);
- if (ia != NULL) {
+ if (ia != NULL)
ip->ip_src = ia->ia_addr.sin_addr;
- ifa_free(&ia->ia_ifa);
- }
+ NET_EPOCH_EXIT();
}
ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
@@ -3634,7 +3616,6 @@ DB_SHOW_COMMAND(igi_list, db_show_igi_list)
db_printf(" qi %u\n", igi->igi_qi);
db_printf(" qri %u\n", igi->igi_qri);
db_printf(" uri %u\n", igi->igi_uri);
- /* SLIST_HEAD(,in_multi) igi_relinmhead */
/* struct mbufq igi_gq; */
db_printf("\n");
}
diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h
index 4f9db06c..11f086f8 100644
--- a/freebsd/sys/netinet/igmp_var.h
+++ b/freebsd/sys/netinet/igmp_var.h
@@ -214,7 +214,6 @@ struct igmp_ifsoftc {
uint32_t igi_qi; /* IGMPv3 Query Interval (s) */
uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */
uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */
- SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */
struct mbufq igi_gq; /* general query responses queue */
};
diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c
index 28c257aa..7233f9a2 100644
--- a/freebsd/sys/netinet/in.c
+++ b/freebsd/sys/netinet/in.c
@@ -104,7 +104,7 @@ in_localaddr(struct in_addr in)
struct in_ifaddr *ia;
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((i & ia->ia_subnetmask) == ia->ia_subnet) {
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
return (1);
@@ -145,7 +145,7 @@ in_ifhasaddr(struct ifnet *ifp, struct in_addr in)
struct in_ifaddr *ia;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = (struct in_ifaddr *)ifa;
@@ -282,7 +282,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
* first one on the interface, if possible.
*/
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = (struct in_ifaddr *)ifa;
@@ -290,7 +290,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
break;
}
if (ifa == NULL)
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET) {
ia = (struct in_ifaddr *)ifa;
if (prison_check_ip4(td->td_ucred,
@@ -381,7 +381,7 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
iaIsFirst = true;
ia = NULL;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct in_ifaddr *it;
if (ifa->ifa_addr->sa_family != AF_INET)
@@ -459,12 +459,12 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
/* if_addrhead is already referenced by ifa_alloc() */
IF_ADDR_WLOCK(ifp);
- TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
+ CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
IF_ADDR_WUNLOCK(ifp);
ifa_ref(ifa); /* in_ifaddrhead */
IN_IFADDR_WLOCK();
- TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
+ CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
IN_IFADDR_WUNLOCK();
@@ -537,12 +537,12 @@ fail1:
(*carp_detach_p)(&ia->ia_ifa, false);
IF_ADDR_WLOCK(ifp);
- TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link);
IF_ADDR_WUNLOCK(ifp);
ifa_free(&ia->ia_ifa); /* if_addrhead */
IN_IFADDR_WLOCK();
- TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
+ CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link);
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
ifa_free(&ia->ia_ifa); /* in_ifaddrhead */
@@ -576,7 +576,7 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
iaIsLast = true;
ia = NULL;
IF_ADDR_WLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct in_ifaddr *it;
if (ifa->ifa_addr->sa_family != AF_INET)
@@ -601,12 +601,12 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
return (EADDRNOTAVAIL);
}
- TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
+ CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link);
IF_ADDR_WUNLOCK(ifp);
ifa_free(&ia->ia_ifa); /* if_addrhead */
IN_IFADDR_WLOCK();
- TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link);
+ CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link);
LIST_REMOVE(ia, ia_hash);
IN_IFADDR_WUNLOCK();
@@ -636,12 +636,10 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td)
struct in_ifinfo *ii;
ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
- IN_MULTI_LOCK();
if (ii->ii_allhosts) {
- (void)in_leavegroup_locked(ii->ii_allhosts, NULL);
+ (void)in_leavegroup(ii->ii_allhosts, NULL);
ii->ii_allhosts = NULL;
}
- IN_MULTI_UNLOCK();
}
IF_ADDR_WLOCK(ifp);
@@ -682,7 +680,7 @@ in_addprefix(struct in_ifaddr *target, int flags)
IN_IFADDR_RLOCK(&in_ifa_tracker);
/* Look for an existing address with the same prefix, mask, and fib */
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
p = ia->ia_dstaddr.sin_addr;
@@ -842,7 +840,7 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags)
}
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (rtinitflags(ia)) {
p = ia->ia_dstaddr.sin_addr;
@@ -918,10 +916,10 @@ in_ifscrub_all(void)
struct ifaliasreq ifr;
IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/* Cannot lock here - lock recursion. */
/* IF_ADDR_RLOCK(ifp); */
- TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
+ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
@@ -982,7 +980,7 @@ in_broadcast(struct in_addr in, struct ifnet *ifp)
* with a broadcast address.
*/
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
if (ifa->ifa_addr->sa_family == AF_INET &&
in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) {
found = 1;
@@ -998,11 +996,12 @@ in_broadcast(struct in_addr in, struct ifnet *ifp)
void
in_ifdetach(struct ifnet *ifp)
{
-
+ IN_MULTI_LOCK(); /* now taken here and held across the pcb purges and in_purgemaddrs() below */
in_pcbpurgeif0(&V_ripcbinfo, ifp);
in_pcbpurgeif0(&V_udbinfo, ifp);
in_pcbpurgeif0(&V_ulitecbinfo, ifp);
in_purgemaddrs(ifp); /* no longer takes IN_MULTI_LOCK itself; it uses IN_MULTI_LIST_LOCK internally */
+ IN_MULTI_UNLOCK();
}
/*
@@ -1015,12 +1014,12 @@ in_ifdetach(struct ifnet *ifp)
static void
in_purgemaddrs(struct ifnet *ifp)
{
- LIST_HEAD(,in_multi) purgeinms;
- struct in_multi *inm, *tinm;
- struct ifmultiaddr *ifma;
+ struct in_multi_head purgeinms; /* records collected here are released in one deferred batch */
+ struct in_multi *inm;
+ struct ifmultiaddr *ifma, *next;
- LIST_INIT(&purgeinms);
- IN_MULTI_LOCK();
+ SLIST_INIT(&purgeinms);
+ IN_MULTI_LIST_LOCK();
/*
* Extract list of in_multi associated with the detaching ifp
@@ -1028,27 +1027,24 @@ in_purgemaddrs(struct ifnet *ifp)
* We need to do this as IF_ADDR_LOCK() may be re-acquired
* by code further down.
*/
- IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ IF_ADDR_WLOCK(ifp);
+ restart: /* rescan from the list head; inm_disconnect() sets ifma_restart after freeing a link-layer ifma */
+ CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
-#if 0
- KASSERT(ifma->ifma_protospec != NULL,
- ("%s: ifma_protospec is NULL", __func__));
-#endif
inm = (struct in_multi *)ifma->ifma_protospec;
- LIST_INSERT_HEAD(&purgeinms, inm, inm_link);
+ inm_rele_locked(&purgeinms, inm); /* NOTE(review): presumably reaches inm_disconnect() when the last reference goes — confirm */
+ if (__predict_false(ifma_restart)) {
+ ifma_restart = false; /* consume the request; leaving it set would force a restart on every later iteration */
+ goto restart;
+ }
}
- IF_ADDR_RUNLOCK(ifp);
+ IF_ADDR_WUNLOCK(ifp);
- LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) {
- LIST_REMOVE(inm, inm_link);
- inm_release_locked(inm);
- }
+ inm_release_list_deferred(&purgeinms); /* the frees happen later in inm_release_task() */
igmp_ifdetach(ifp);
-
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
}
struct in_llentry {
@@ -1063,9 +1059,11 @@ struct in_llentry {
* Do actual deallocation of @lle.
*/
static void
-in_lltable_destroy_lle_unlocked(struct llentry *lle)
+in_lltable_destroy_lle_unlocked(epoch_context_t ctx)
{
+ struct llentry *lle;
+ lle = __containerof(ctx, struct llentry, lle_epoch_ctx);
LLE_LOCK_DESTROY(lle);
LLE_REQ_DESTROY(lle);
free(lle, M_LLTABLE);
@@ -1093,7 +1091,7 @@ in_lltable_destroy_lle(struct llentry *lle)
{
LLE_WUNLOCK(lle);
- in_lltable_destroy_lle_unlocked(lle);
+ epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked);
}
static struct llentry *
@@ -1160,7 +1158,6 @@ in_lltable_match_prefix(const struct sockaddr *saddr,
static void
in_lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
- struct ifnet *ifp;
size_t pkts_dropped;
LLE_WLOCK_ASSERT(lle);
@@ -1168,8 +1165,7 @@ in_lltable_free_entry(struct lltable *llt, struct llentry *lle)
/* Unlink entry from table if not already */
if ((lle->la_flags & LLE_LINKED) != 0) {
- ifp = llt->llt_ifp;
- IF_AFDATA_WLOCK_ASSERT(ifp);
+ IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
lltable_unlink_entry(llt, lle);
}
@@ -1304,7 +1300,7 @@ in_lltable_find_dst(struct lltable *llt, struct in_addr dst)
hashidx = in_lltable_hash_dst(dst, llt->llt_hsize);
lleh = &llt->lle_head[hashidx];
- LIST_FOREACH(lle, lleh, lle_next) {
+ CK_LIST_FOREACH(lle, lleh, lle_next) {
if (lle->la_flags & LLE_DELETED)
continue;
if (lle->r_l3addr.addr4.s_addr == dst.s_addr)
@@ -1360,7 +1356,7 @@ in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr
linkhdrsize = LLE_MAX_LINKHDR;
if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp),
linkhdr, &linkhdrsize, &lladdr_off) != 0) {
- in_lltable_destroy_lle_unlocked(lle);
+ epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked);
return (NULL);
}
lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
@@ -1420,52 +1416,51 @@ in_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
int error;
bzero(&arpc, sizeof(arpc));
- /* skip deleted entries */
- if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
- return (0);
- /* Skip if jailed and not a valid IP of the prison. */
- lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin);
- if (prison_if(wr->td->td_ucred,
- (struct sockaddr *)&arpc.sin) != 0)
- return (0);
- /*
- * produce a msg made of:
- * struct rt_msghdr;
- * struct sockaddr_in; (IPv4)
- * struct sockaddr_dl;
- */
- arpc.rtm.rtm_msglen = sizeof(arpc);
- arpc.rtm.rtm_version = RTM_VERSION;
- arpc.rtm.rtm_type = RTM_GET;
- arpc.rtm.rtm_flags = RTF_UP;
- arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
-
- /* publish */
- if (lle->la_flags & LLE_PUB)
- arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
-
- sdl = &arpc.sdl;
- sdl->sdl_family = AF_LINK;
- sdl->sdl_len = sizeof(*sdl);
- sdl->sdl_index = ifp->if_index;
- sdl->sdl_type = ifp->if_type;
- if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
- sdl->sdl_alen = ifp->if_addrlen;
- bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
- } else {
- sdl->sdl_alen = 0;
- bzero(LLADDR(sdl), ifp->if_addrlen);
- }
+ /* skip deleted entries */
+ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
+ return (0);
+ /* Skip if jailed and not a valid IP of the prison. */
+ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin);
+ if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0)
+ return (0);
+ /*
+ * produce a msg made of:
+ * struct rt_msghdr;
+ * struct sockaddr_in; (IPv4)
+ * struct sockaddr_dl;
+ */
+ arpc.rtm.rtm_msglen = sizeof(arpc);
+ arpc.rtm.rtm_version = RTM_VERSION;
+ arpc.rtm.rtm_type = RTM_GET;
+ arpc.rtm.rtm_flags = RTF_UP;
+ arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
+
+ /* publish */
+ if (lle->la_flags & LLE_PUB)
+ arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
+
+ sdl = &arpc.sdl;
+ sdl->sdl_family = AF_LINK;
+ sdl->sdl_len = sizeof(*sdl);
+ sdl->sdl_index = ifp->if_index;
+ sdl->sdl_type = ifp->if_type;
+ if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
+ sdl->sdl_alen = ifp->if_addrlen;
+ bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
+ } else {
+ sdl->sdl_alen = 0;
+ bzero(LLADDR(sdl), ifp->if_addrlen);
+ }
- arpc.rtm.rtm_rmx.rmx_expire =
- lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
- arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
- if (lle->la_flags & LLE_STATIC)
- arpc.rtm.rtm_flags |= RTF_STATIC;
- if (lle->la_flags & LLE_IFADDR)
- arpc.rtm.rtm_flags |= RTF_PINNED;
- arpc.rtm.rtm_index = ifp->if_index;
- error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
+ arpc.rtm.rtm_rmx.rmx_expire =
+ lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
+ arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
+ if (lle->la_flags & LLE_STATIC)
+ arpc.rtm.rtm_flags |= RTF_STATIC;
+ if (lle->la_flags & LLE_IFADDR)
+ arpc.rtm.rtm_flags |= RTF_PINNED;
+ arpc.rtm.rtm_index = ifp->if_index;
+ error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
return (error);
}
diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c
index 41beed9b..ea4779fc 100644
--- a/freebsd/sys/netinet/in_mcast.c
+++ b/freebsd/sys/netinet/in_mcast.c
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/taskqueue.h>
+#include <sys/gtaskqueue.h>
#include <sys/tree.h>
#include <net/if.h>
@@ -61,6 +62,8 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#include <net/ethernet.h>
+
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_fib.h>
@@ -93,17 +96,25 @@ static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
/*
* Locking:
- * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
+ * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
* - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
* it can be taken by code in net/if.c also.
* - ip_moptions and in_mfilter are covered by the INP_WLOCK.
*
- * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly
+ * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly
* any need for in_multi itself to be virtualized -- it is bound to an ifp
* anyway no matter what happens.
*/
-struct mtx in_multi_mtx;
-MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF);
+struct mtx in_multi_list_mtx;
+MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF);
+
+struct mtx in_multi_free_mtx;
+MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF);
+
+struct sx in_multi_sx;
+SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx");
+
+int ifma_restart;
/*
* Functions with non-static linkage defined in this file should be
@@ -153,10 +164,9 @@ static int inm_is_ifp_detached(const struct in_multi *);
static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
static void inm_purge(struct in_multi *);
static void inm_reap(struct in_multi *);
+static void inm_release(struct in_multi *);
static struct ip_moptions *
inp_findmoptions(struct inpcb *);
-static void inp_freemoptions_internal(struct ip_moptions *);
-static void inp_gcmoptions(void *, int);
static int inp_get_source_filters(struct inpcb *, struct sockopt *);
static int inp_join_group(struct inpcb *, struct sockopt *);
static int inp_leave_group(struct inpcb *, struct sockopt *);
@@ -189,10 +199,6 @@ static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
"Per-interface stack-wide source filters");
-static STAILQ_HEAD(, ip_moptions) imo_gc_list =
- STAILQ_HEAD_INITIALIZER(imo_gc_list);
-static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL);
-
#ifdef KTR
/*
* Inline function which wraps assertions for a valid ifp.
@@ -218,6 +224,93 @@ inm_is_ifp_detached(const struct in_multi *inm)
}
#endif
+static struct grouptask free_gtask; /* group task that runs inm_release_task() when enqueued */
+static struct in_multi_head inm_free_list; /* in_multi records awaiting inm_release(); guarded by in_multi_free_mtx */
+static void inm_release_task(void *arg __unused);
+static void inm_init(void) /* SYSINIT hook: prepares the deferred-release list and its group task */
+{
+ SLIST_INIT(&inm_free_list);
+ taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task");
+}
+
+SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST,
+ inm_init, NULL);
+
+
+void
+inm_release_list_deferred(struct in_multi_head *inmh)
+{
+ /* Hand every in_multi on inmh to the global free list and schedule free_gtask to release them. */
+ if (SLIST_EMPTY(inmh)) /* nothing to hand over, so the task is not enqueued */
+ return;
+ mtx_lock(&in_multi_free_mtx);
+ SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); /* splice the caller's list onto inm_free_list */
+ mtx_unlock(&in_multi_free_mtx);
+ GROUPTASK_ENQUEUE(&free_gtask); /* inm_release_task() performs the actual inm_release() calls */
+}
+
+void
+inm_disconnect(struct in_multi *inm)
+{
+ struct ifnet *ifp;
+ struct ifmultiaddr *ifma, *ll_ifma;
+ /* Unlink inm's ifmultiaddr (and, on last reference, its link-layer ifmultiaddr) from the interface list. */
+ ifp = inm->inm_ifp;
+ IF_ADDR_WLOCK_ASSERT(ifp); /* caller must hold the interface address write lock */
+ ifma = inm->inm_ifma;
+
+ if_ref(ifp); /* NOTE(review): presumably paired with the if_rele() in inm_release() — confirm */
+ CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
+ MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
+ if ((ll_ifma = ifma->ifma_llifma) != NULL) {
+ MPASS(ifma != ll_ifma);
+ ifma->ifma_llifma = NULL; /* detach the link-layer record before dropping its reference */
+ MPASS(ll_ifma->ifma_llifma == NULL);
+ MPASS(ll_ifma->ifma_ifp == ifp);
+ if (--ll_ifma->ifma_refcount == 0) {
+ CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
+ MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
+ if_freemulti(ll_ifma);
+ ifma_restart = true; /* a second list element was removed and freed: tells list walkers (e.g. in_purgemaddrs) to restart */
+ }
+ }
+}
+
+void
+inm_release_deferred(struct in_multi *inm)
+{
+ struct in_multi_head tmp;
+ /* Drop one reference; on the last one, disconnect inm and queue it for release by the free task. */
+ IN_MULTI_LIST_LOCK_ASSERT();
+ MPASS(inm->inm_refcount > 0);
+ if (--inm->inm_refcount == 0) {
+ SLIST_INIT(&tmp);
+ inm_disconnect(inm); /* asserts IF_ADDR_WLOCK on inm->inm_ifp, so the caller must hold it as well */
+ inm->inm_ifma->ifma_protospec = NULL; /* lookups skip entries whose ifma_protospec is NULL */
+ SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); /* one-element list, the form inm_release_list_deferred() expects */
+ inm_release_list_deferred(&tmp);
+ }
+}
+
+static void
+inm_release_task(void *arg __unused)
+{
+ struct in_multi_head inm_free_tmp;
+ struct in_multi *inm, *tinm;
+ /* Group-task handler: release every in_multi queued on inm_free_list. */
+ SLIST_INIT(&inm_free_tmp);
+ mtx_lock(&in_multi_free_mtx);
+ SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); /* take the whole pending list so the mutex is not held during the releases */
+ mtx_unlock(&in_multi_free_mtx);
+ IN_MULTI_LOCK();
+ SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) {
+ SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); /* inm is always the current head, so this unlinks inm itself */
+ MPASS(inm);
+ inm_release(inm); /* frees inm; the _SAFE iterator already saved the next element in tinm */
+ }
+ IN_MULTI_UNLOCK();
+}
+
/*
* Initialize an in_mfilter structure to a known state at t0, t1
* with an empty source filter list.
@@ -234,7 +327,7 @@ imf_init(struct in_mfilter *imf, const int st0, const int st1)
/*
* Function for looking up an in_multi record for an IPv4 multicast address
* on a given interface. ifp must be valid. If no record found, return NULL.
- * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held.
+ * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held.
*/
struct in_multi *
inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
@@ -242,17 +335,18 @@ inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
struct ifmultiaddr *ifma;
struct in_multi *inm;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IF_ADDR_LOCK_ASSERT(ifp);
inm = NULL;
- TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
- if (ifma->ifma_addr->sa_family == AF_INET) {
- inm = (struct in_multi *)ifma->ifma_protospec;
- if (inm->inm_addr.s_addr == ina.s_addr)
- break;
- inm = NULL;
- }
+ CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_addr.s_addr == ina.s_addr)
+ break;
+ inm = NULL;
}
return (inm);
}
@@ -266,7 +360,7 @@ inm_lookup(struct ifnet *ifp, const struct in_addr ina)
{
struct in_multi *inm;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
IF_ADDR_RLOCK(ifp);
inm = inm_lookup_locked(ifp, ina);
IF_ADDR_RUNLOCK(ifp);
@@ -453,7 +547,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
IN_MULTI_LOCK_ASSERT();
ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
-
+ IN_MULTI_LIST_LOCK();
inm = inm_lookup(ifp, *group);
if (inm != NULL) {
/*
@@ -462,11 +556,13 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
*/
KASSERT(inm->inm_refcount >= 1,
("%s: bad refcount %d", __func__, inm->inm_refcount));
- ++inm->inm_refcount;
+ inm_acquire_locked(inm);
*pinm = inm;
- return (0);
}
-
+ IN_MULTI_LIST_UNLOCK();
+ if (inm != NULL)
+ return (0);
+
memset(&gsin, 0, sizeof(gsin));
gsin.sin_family = AF_INET;
gsin.sin_len = sizeof(struct sockaddr_in);
@@ -481,6 +577,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
return (error);
/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
+ IN_MULTI_LIST_LOCK();
IF_ADDR_WLOCK(ifp);
/*
@@ -506,10 +603,9 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
__func__, ifma, inm, inet_ntoa_r(*group, addrbuf));
}
#endif
- ++inm->inm_refcount;
+ inm_acquire_locked(inm);
*pinm = inm;
- IF_ADDR_WUNLOCK(ifp);
- return (0);
+ goto out_locked;
}
IF_ADDR_WLOCK_ASSERT(ifp);
@@ -524,6 +620,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
if (inm == NULL) {
IF_ADDR_WUNLOCK(ifp);
+ IN_MULTI_LIST_UNLOCK();
if_delmulti_ifma(ifma);
return (ENOMEM);
}
@@ -541,8 +638,9 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
ifma->ifma_protospec = inm;
*pinm = inm;
-
+ out_locked:
IF_ADDR_WUNLOCK(ifp);
+ IN_MULTI_LIST_UNLOCK();
return (0);
}
@@ -552,36 +650,33 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group,
* If the refcount drops to 0, free the in_multi record and
* delete the underlying link-layer membership.
*/
-void
-inm_release_locked(struct in_multi *inm)
+static void
+inm_release(struct in_multi *inm)
{
struct ifmultiaddr *ifma;
-
- IN_MULTI_LOCK_ASSERT();
+ struct ifnet *ifp;
CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
-
- if (--inm->inm_refcount > 0) {
- CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__,
- inm->inm_refcount);
- return;
- }
-
+ MPASS(inm->inm_refcount == 0); /* the count is no longer decremented here; callers drop it to zero before queueing inm */
CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
ifma = inm->inm_ifma;
+ ifp = inm->inm_ifp; /* read before inm is freed below */
/* XXX this access is not covered by IF_ADDR_LOCK */
CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
- KASSERT(ifma->ifma_protospec == inm,
- ("%s: ifma_protospec != inm", __func__));
- ifma->ifma_protospec = NULL;
-
- inm_purge(inm);
-
- free(inm, M_IPMADDR);
-
- if_delmulti_ifma(ifma);
+ if (ifp != NULL) {
+ CURVNET_SET(ifp->if_vnet); /* run the teardown in the interface's vnet context */
+ inm_purge(inm);
+ free(inm, M_IPMADDR);
+ if_delmulti_ifma_flags(ifma, 1);
+ CURVNET_RESTORE();
+ if_rele(ifp); /* NOTE(review): presumably drops the reference taken by if_ref() in inm_disconnect() — confirm */
+ } else {
+ inm_purge(inm); /* same teardown without a vnet switch when no ifp is attached */
+ free(inm, M_IPMADDR);
+ if_delmulti_ifma_flags(ifma, 1);
+ }
}
/*
@@ -594,7 +689,7 @@ inm_clear_recorded(struct in_multi *inm)
{
struct ip_msource *ims;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
if (ims->ims_stp) {
@@ -634,7 +729,7 @@ inm_record_source(struct in_multi *inm, const in_addr_t naddr)
struct ip_msource find;
struct ip_msource *ims, *nims;
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
find.ims_haddr = ntohl(naddr);
ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
@@ -961,6 +1056,7 @@ inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
schanged = 0;
error = 0;
nsrc1 = nsrc0 = 0;
+ IN_MULTI_LIST_LOCK_ASSERT();
/*
* Update the source filters first, as this may fail.
@@ -1167,6 +1263,7 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
int error;
IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_UNLOCK_ASSERT();
CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__,
ntohl(gina->s_addr), ifp, ifp->if_xname);
@@ -1188,7 +1285,7 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
return (error);
}
-
+ IN_MULTI_LIST_LOCK();
CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
error = inm_merge(inm, imf);
if (error) {
@@ -1203,13 +1300,15 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
goto out_inm_release;
}
-out_inm_release:
+ out_inm_release:
if (error) {
+
CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
- inm_release_locked(inm);
+ inm_release_deferred(inm);
} else {
*pinm = inm;
}
+ IN_MULTI_LIST_UNLOCK();
return (error);
}
@@ -1251,6 +1350,7 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
error = 0;
IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_UNLOCK_ASSERT();
CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__,
inm, ntohl(inm->inm_addr.s_addr),
@@ -1274,18 +1374,22 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
* the transaction, it MUST NOT fail.
*/
CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ IN_MULTI_LIST_LOCK();
error = inm_merge(inm, imf);
KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
CURVNET_SET(inm->inm_ifp->if_vnet);
error = igmp_change_state(inm);
+ IF_ADDR_WLOCK(inm->inm_ifp);
+ inm_release_deferred(inm);
+ IF_ADDR_WUNLOCK(inm->inm_ifp);
+ IN_MULTI_LIST_UNLOCK();
CURVNET_RESTORE();
if (error)
CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
- inm_release_locked(inm);
return (error);
}
@@ -1317,18 +1421,6 @@ in_addmulti(struct in_addr *ap, struct ifnet *ifp)
}
/*
- * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode.
- * This KPI is for legacy kernel consumers only.
- */
-void
-in_delmulti(struct in_multi *inm)
-{
-
- (void)in_leavegroup(inm, NULL);
-}
-/*#endif*/
-
-/*
* Block or unblock an ASM multicast source on an inpcb.
* This implements the delta-based API described in RFC 3678.
*
@@ -1489,7 +1581,7 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
* Begin state merge transaction at IGMP layer.
*/
IN_MULTI_LOCK();
-
+ IN_MULTI_LIST_LOCK();
CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
error = inm_merge(inm, imf);
if (error) {
@@ -1505,7 +1597,7 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
out_in_multi_locked:
IN_MULTI_UNLOCK();
-
+ IN_MULTI_LIST_UNLOCK(); /* pairs with the IN_MULTI_LIST_LOCK() taken right after IN_MULTI_LOCK(); a second IN_MULTI_UNLOCK() here would double-unlock the sx and leak the list mutex */
out_imf_rollback:
if (error)
imf_rollback(imf);
@@ -1571,37 +1663,31 @@ inp_findmoptions(struct inpcb *inp)
return (imo);
}
-/*
- * Discard the IP multicast options (and source filters). To minimize
- * the amount of work done while holding locks such as the INP's
- * pcbinfo lock (which is used in the receive path), the free
- * operation is performed asynchronously in a separate task.
- *
- * SMPng: NOTE: assumes INP write lock is held.
- */
-void
-inp_freemoptions(struct ip_moptions *imo)
-{
-
- KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
- IN_MULTI_LOCK();
- STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link);
- IN_MULTI_UNLOCK();
- taskqueue_enqueue(taskqueue_thread, &imo_gc_task);
-}
-
static void
-inp_freemoptions_internal(struct ip_moptions *imo)
+inp_gcmoptions(epoch_context_t ctx)
{
+ struct ip_moptions *imo;
struct in_mfilter *imf;
+ struct in_multi *inm;
+ struct ifnet *ifp;
size_t idx, nmships;
+ imo = __containerof(ctx, struct ip_moptions, imo_epoch_ctx);
+
nmships = imo->imo_num_memberships;
for (idx = 0; idx < nmships; ++idx) {
imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL;
if (imf)
imf_leave(imf);
- (void)in_leavegroup(imo->imo_membership[idx], imf);
+ inm = imo->imo_membership[idx];
+ ifp = inm->inm_ifp;
+ if (ifp != NULL) {
+ CURVNET_SET(ifp->if_vnet);
+ (void)in_leavegroup(inm, imf);
+ CURVNET_RESTORE();
+ } else {
+ (void)in_leavegroup(inm, imf);
+ }
if (imf)
imf_purge(imf);
}
@@ -1612,20 +1698,18 @@ inp_freemoptions_internal(struct ip_moptions *imo)
free(imo, M_IPMOPTS);
}
-static void
-inp_gcmoptions(void *context, int pending)
+/*
+ * Discard the IP multicast options (and source filters). To minimize
+ * the amount of work done while holding locks such as the INP's
+ * pcbinfo lock (which is used in the receive path), the free
+ * operation is deferred to the epoch callback task.
+ */
+void
+inp_freemoptions(struct ip_moptions *imo)
{
- struct ip_moptions *imo;
-
- IN_MULTI_LOCK();
- while (!STAILQ_EMPTY(&imo_gc_list)) {
- imo = STAILQ_FIRST(&imo_gc_list);
- STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link);
- IN_MULTI_UNLOCK();
- inp_freemoptions_internal(imo);
- IN_MULTI_LOCK();
- }
- IN_MULTI_UNLOCK();
+ if (imo == NULL) /* a NULL imo is now tolerated; the old version asserted imo != NULL */
+ return;
+ epoch_call(net_epoch_preempt, &imo->imo_epoch_ctx, inp_gcmoptions); /* inp_gcmoptions() leaves the groups and frees imo */
}
/*
@@ -1794,12 +1878,12 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
mreqn.imr_address = imo->imo_multicast_addr;
} else if (ifp != NULL) {
mreqn.imr_ifindex = ifp->if_index;
+ NET_EPOCH_ENTER();
IFP_TO_IA(ifp, ia, &in_ifa_tracker);
- if (ia != NULL) {
+ if (ia != NULL)
mreqn.imr_address =
IA_SIN(ia)->sin_addr;
- ifa_free(&ia->ia_ifa);
- }
+ NET_EPOCH_EXIT();
}
}
INP_WUNLOCK(inp);
@@ -1907,7 +1991,7 @@ inp_lookup_mcast_ifp(const struct inpcb *inp,
mifp = NULL;
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
mifp = ia->ia_ifp;
if (!(mifp->if_flags & IFF_LOOPBACK) &&
(mifp->if_flags & IFF_MULTICAST)) {
@@ -2165,6 +2249,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
/*
* Begin state merge transaction at IGMP layer.
*/
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
IN_MULTI_LOCK();
if (is_new) {
@@ -2173,20 +2259,23 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
if (error) {
CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed",
__func__);
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
goto out_imo_free;
}
imo->imo_membership[idx] = inm;
} else {
CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ IN_MULTI_LIST_LOCK();
error = inm_merge(inm, imf);
if (error) {
CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
- __func__);
+ __func__);
+ IN_MULTI_LIST_UNLOCK();
goto out_in_multi_locked;
}
CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
error = igmp_change_state(inm);
+ IN_MULTI_LIST_UNLOCK();
if (error) {
CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
__func__);
@@ -2197,8 +2286,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt)
out_in_multi_locked:
IN_MULTI_UNLOCK();
-
- INP_WLOCK_ASSERT(inp);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp))
+ return (ENXIO);
if (error) {
imf_rollback(imf);
if (is_new)
@@ -2387,6 +2477,8 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
/*
* Begin state merge transaction at IGMP layer.
*/
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
IN_MULTI_LOCK();
if (is_final) {
@@ -2397,6 +2489,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
(void)in_leavegroup_locked(inm, imf);
} else {
CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
+ IN_MULTI_LIST_LOCK();
error = inm_merge(inm, imf);
if (error) {
CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
@@ -2406,6 +2499,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
error = igmp_change_state(inm);
+ IN_MULTI_LIST_UNLOCK();
if (error) {
CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
__func__);
@@ -2415,6 +2509,9 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
out_in_multi_locked:
IN_MULTI_UNLOCK();
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp))
+ return (ENXIO);
if (error)
imf_rollback(imf);
@@ -2641,6 +2738,7 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
INP_WLOCK_ASSERT(inp);
IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
/*
* Begin state merge transaction at IGMP layer.
@@ -2649,11 +2747,13 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
error = inm_merge(inm, imf);
if (error) {
CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
+ IN_MULTI_LIST_UNLOCK();
goto out_in_multi_locked;
}
CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
error = igmp_change_state(inm);
+ IN_MULTI_LIST_UNLOCK();
if (error)
CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
@@ -2885,10 +2985,10 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
if (retval)
return (retval);
- IN_MULTI_LOCK();
+ IN_MULTI_LIST_LOCK();
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
if (ifma->ifma_addr->sa_family != AF_INET ||
ifma->ifma_protospec == NULL)
continue;
@@ -2918,7 +3018,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
}
IF_ADDR_RUNLOCK(ifp);
- IN_MULTI_UNLOCK();
+ IN_MULTI_LIST_UNLOCK();
return (retval);
}
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c
index 0d388132..f89487b6 100644
--- a/freebsd/sys/netinet/in_pcb.c
+++ b/freebsd/sys/netinet/in_pcb.c
@@ -64,6 +64,7 @@ __FBSDID("$FreeBSD$");
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
@@ -93,6 +94,9 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
+#ifdef TCPHPTS
+#include <netinet/tcp_hpts.h>
+#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#endif
@@ -590,7 +594,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
INP_LOCK_ASSERT(inp);
INP_HASH_LOCK_ASSERT(pcbinfo);
- if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
+ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
return (EADDRNOTAVAIL);
laddr.s_addr = *laddrp;
if (nam != NULL && laddr.s_addr != INADDR_ANY)
@@ -802,7 +806,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
int error;
KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
-
/*
* Bypass source address selection and use the primary jail IP
* if requested.
@@ -835,15 +838,18 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
* network and try to find a corresponding interface to take
* the source address from.
*/
+ NET_EPOCH_ENTER();
if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
struct in_ifaddr *ia;
struct ifnet *ifp;
ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
inp->inp_socket->so_fibnum));
- if (ia == NULL)
+ if (ia == NULL) {
ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
inp->inp_socket->so_fibnum));
+
+ }
if (ia == NULL) {
error = ENETUNREACH;
goto done;
@@ -851,15 +857,13 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
if (cred == NULL || !prison_flag(cred, PR_IP4)) {
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
- ifa_free(&ia->ia_ifa);
goto done;
}
ifp = ia->ia_ifp;
- ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
@@ -918,7 +922,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
ia = NULL;
ifp = sro.ro_rt->rt_ifp;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
continue;
@@ -972,7 +976,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
goto done;
}
laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
- ifa_free(&ia->ia_ifa);
goto done;
}
@@ -981,10 +984,9 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
struct ifnet *ifp;
ifp = ia->ia_ifp;
- ifa_free(&ia->ia_ifa);
ia = NULL;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sa = ifa->ifa_addr;
if (sa->sa_family != AF_INET)
@@ -1010,6 +1012,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
}
done:
+ NET_EPOCH_EXIT();
if (sro.ro_rt != NULL)
RTFREE(sro.ro_rt);
return (error);
@@ -1063,7 +1066,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
faddr = sin->sin_addr;
fport = sin->sin_port;
- if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
+ if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
* use the primary local address.
@@ -1074,16 +1077,16 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
if (faddr.s_addr == INADDR_ANY) {
IN_IFADDR_RLOCK(&in_ifa_tracker);
faddr =
- IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
+ IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
if (cred != NULL &&
(error = prison_get_ip4(cred, &faddr)) != 0)
return (error);
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
IN_IFADDR_RLOCK(&in_ifa_tracker);
- if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
+ if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
IFF_BROADCAST)
- faddr = satosin(&TAILQ_FIRST(
+ faddr = satosin(&CK_STAILQ_FIRST(
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
IN_IFADDR_RUNLOCK(&in_ifa_tracker);
}
@@ -1104,7 +1107,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
if (imo->imo_multicast_ifp != NULL) {
ifp = imo->imo_multicast_ifp;
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if ((ia->ia_ifp == ifp) &&
(cred == NULL ||
prison_check_ip4(cred,
@@ -1236,9 +1239,28 @@ in_pcbrele_rlocked(struct inpcb *inp)
}
return (0);
}
-
+
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+ * this point in any form. we must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_RUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1267,7 +1289,26 @@ in_pcbrele_wlocked(struct inpcb *inp)
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+ * this point in any form. we must
+ * get the lock to be sure.
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_WUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1284,6 +1325,28 @@ in_pcbrele(struct inpcb *inp)
return (in_pcbrele_wlocked(inp));
}
+void
+in_pcblist_rele_rlocked(epoch_context_t ctx)
+{
+ struct in_pcblist *il;
+ struct inpcb *inp;
+ struct inpcbinfo *pcbinfo;
+ int i, n;
+
+ il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
+ pcbinfo = il->il_pcbinfo;
+ n = il->il_count;
+ INP_INFO_WLOCK(pcbinfo);
+ for (i = 0; i < n; i++) {
+ inp = il->il_inp_list[i];
+ INP_RLOCK(inp);
+ if (!in_pcbrele_rlocked(inp))
+ INP_RUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(pcbinfo);
+ free(il, M_TEMP);
+}
+
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
@@ -1298,8 +1361,21 @@ in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+#ifdef INET6
+ struct ip6_moptions *im6o = NULL;
+#endif
+#ifdef INET
+ struct ip_moptions *imo = NULL;
+#endif
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
+ KASSERT((inp->inp_flags2 & INP_FREED) == 0,
+ ("%s: called twice for pcb %p", __func__, inp));
+ if (inp->inp_flags2 & INP_FREED) {
+ INP_WUNLOCK(inp);
+ return;
+ }
+
#ifdef INVARIANTS
if (pcbinfo == &V_tcbinfo) {
INP_INFO_LOCK_ASSERT(pcbinfo);
@@ -1309,6 +1385,10 @@ in_pcbfree(struct inpcb *inp)
#endif
INP_WLOCK_ASSERT(inp);
+#ifdef INET
+ imo = inp->inp_moptions;
+ inp->inp_moptions = NULL;
+#endif
/* XXXRW: Do as much as possible here. */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
if (inp->inp_sp != NULL)
@@ -1321,16 +1401,12 @@ in_pcbfree(struct inpcb *inp)
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
ip6_freepcbopts(inp->in6p_outputopts);
- if (inp->in6p_moptions != NULL)
- ip6_freemoptions(inp->in6p_moptions);
+ im6o = inp->in6p_moptions;
+ inp->in6p_moptions = NULL;
}
#endif
if (inp->inp_options)
(void)m_free(inp->inp_options);
-#ifdef INET
- if (inp->inp_moptions != NULL)
- inp_freemoptions(inp->inp_moptions);
-#endif
RO_INVALIDATE_CACHE(&inp->inp_route);
inp->inp_vflag = 0;
@@ -1339,6 +1415,12 @@ in_pcbfree(struct inpcb *inp)
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
+#ifdef INET6
+ ip6_freemoptions(im6o);
+#endif
+#ifdef INET
+ inp_freemoptions(imo);
+#endif
if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
}
@@ -1492,11 +1574,14 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
/*
* Drop multicast group membership if we joined
* through the interface being detached.
+ *
+ * XXX This can all be deferred to an epoch_call
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships;
i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
- in_delmulti(imo->imo_membership[i]);
+ IN_MULTI_LOCK_ASSERT();
+ in_leavegroup_locked(imo->imo_membership[i], NULL);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] =
diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h
index 574ab407..d00dd456 100644
--- a/freebsd/sys/netinet/in_pcb.h
+++ b/freebsd/sys/netinet/in_pcb.h
@@ -41,6 +41,7 @@
#define _NETINET_IN_PCB_H_
#include <sys/queue.h>
+#include <sys/epoch.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/_rwlock.h>
@@ -156,6 +157,7 @@ struct in_conninfo {
* from the global list.
*
* Key:
+ * (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
@@ -164,6 +166,51 @@ struct in_conninfo {
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
+ *
+ * Notes on the tcp_hpts:
+ *
+ * First Hpts lock order is
+ * 1) INP_WLOCK()
+ * 2) HPTS_LOCK() i.e. hpts->p_mtx
+ *
+ * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
+ * You may check the inp->inp_in_hpts flag without the hpts lock.
+ * The hpts is the only one that will clear this flag holding
+ * only the hpts lock. This means that in your tcp_output()
+ * routine when you test for the inp_in_hpts flag to be 1
+ * it may be transitioning to 0 (by the hpts).
+ * That's ok since that will just mean an extra call to tcp_output
+ * that most likely will find the call you executed
+ * (when the mismatch occurred) will have put the TCB back
+ * on the hpts and it will return. If your
+ * call did not add the inp back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Though usually
+ * you are either doing this from a timer, where you need and have
+ * the INP_WLOCK() or from destroying your TCB where again
+ * you should already have the INP_WLOCK().
+ *
+ * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
+ * inp_input_cpu_set fields are controlled completely by
+ * the hpts. Do not ever set these. The inp_hpts_cpu_set
+ * and inp_input_cpu_set fields indicate if the hpts has
+ * set up the respective cpu field. It is advised if this
+ * field is 0, to enqueue the packet with the appropriate
+ * hpts_immediate() call. If the _set field is 1, then
+ * you may compare the inp_*_cpu field to the curcpu and
+ * may want to again insert onto the hpts if these fields
+ * are not equal (i.e. you are not on the expected CPU).
+ *
+ * A note on inp_hpts_calls and inp_input_calls, these
+ * flags are set when the hpts calls either the output
+ * or do_segment routines respectively. If the routine
+ * being called wants to use this, then it needs to
+ * clear the flag before returning. The hpts will not
+ * clear the flag. The flags can be used to tell if
+ * the hpts is the function calling the respective
+ * routine.
*
* A few other notes:
*
@@ -190,14 +237,45 @@ struct inpcb {
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
-#define inp_start_zero inp_refcount
+#define inp_start_zero inp_hpts
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
+ TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
+
+ uint32_t inp_hpts_request; /* Current hpts request, zero if
+ * fits in the pacing window (i&b). */
+ /*
+ * Note the next fields are protected by a
+ * different lock (hpts-lock). This means that
+ * they must correspond in size to the smallest
+ * protectable bit field (uint8_t on x86, and
+ * other platforms potentially uint32_t?). Also
+ * since CPU switches can occur at different times the two
+ * fields can *not* be collapsed into a single bit field.
+ */
+#if defined(__amd64__) || defined(__i386__)
+ volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+#else
+ volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+#endif
+ volatile uint16_t inp_hpts_cpu; /* Lock (i) */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
+ volatile uint16_t inp_input_cpu; /* Lock (i) */
+ volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
+ inp_input_cpu_set : 1, /* on input hpts (i) */
+ inp_hpts_calls :1, /* (i) from output hpts */
+ inp_input_calls :1, /* (i) from input hpts */
+ inp_spare_bits2 : 4;
+ uint8_t inp_spare_byte; /* Compiler hole */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
+ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */
+ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
+ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@@ -330,6 +408,13 @@ struct inpcbport {
u_short phd_port;
};
+struct in_pcblist {
+ int il_count;
+ struct epoch_context il_epoch_ctx;
+ struct inpcbinfo *il_pcbinfo;
+ struct inpcb *il_inp_list[0];
+};
+
/*-
* Global data structure for each high-level protocol (UDP, TCP, ...) in both
* IPv4 and IPv6. Holds inpcb lists and information for managing them.
@@ -638,6 +723,7 @@ short inp_so_options(const struct inpcb *inp);
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
/*
* Flags passed to in_pcblookup*() functions.
@@ -751,6 +837,7 @@ void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *);
int in_pcbrele(struct inpcb *);
int in_pcbrele_rlocked(struct inpcb *);
int in_pcbrele_wlocked(struct inpcb *);
+void in_pcblist_rele_rlocked(epoch_context_t ctx);
void in_losing(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c
index f1dec6c5..a563c950 100644
--- a/freebsd/sys/netinet/in_proto.c
+++ b/freebsd/sys/netinet/in_proto.c
@@ -229,7 +229,6 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap4_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = encap_init,
.pr_usrreqs = &rip_usrreqs
},
{
@@ -239,7 +238,6 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap4_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = encap_init,
.pr_usrreqs = &rip_usrreqs
},
{
@@ -249,7 +247,6 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap4_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = encap_init,
.pr_usrreqs = &rip_usrreqs
},
{
@@ -259,7 +256,6 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap4_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = encap_init,
.pr_usrreqs = &rip_usrreqs
},
# ifdef INET6
@@ -270,7 +266,6 @@ struct protosw inetsw[] = {
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = encap4_input,
.pr_ctloutput = rip_ctloutput,
- .pr_init = encap_init,
.pr_usrreqs = &rip_usrreqs
},
#endif
diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h
index ff722fc9..5b7a464b 100644
--- a/freebsd/sys/netinet/in_var.h
+++ b/freebsd/sys/netinet/in_var.h
@@ -55,6 +55,7 @@ struct in_aliasreq {
struct igmp_ifsoftc;
struct in_multi;
struct lltable;
+SLIST_HEAD(in_multi_head, in_multi);
/*
* IPv4 per-interface state.
@@ -79,7 +80,7 @@ struct in_ifaddr {
u_long ia_subnet; /* subnet address */
u_long ia_subnetmask; /* mask of subnet */
LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
- TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */
+ CK_STAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
#define ia_broadaddr ia_dstaddr
@@ -106,7 +107,7 @@ extern u_char inetctlerrmap[];
/*
* Hash table for IP addresses.
*/
-TAILQ_HEAD(in_ifaddrhead, in_ifaddr);
+CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr);
LIST_HEAD(in_ifaddrhashhead, in_ifaddr);
VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl);
@@ -171,12 +172,10 @@ do { \
/* struct rm_priotracker *t; */ \
do { \
IN_IFADDR_RLOCK((t)); \
- for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \
+ for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead); \
(ia) != NULL && (ia)->ia_ifp != (ifp); \
- (ia) = TAILQ_NEXT((ia), ia_link)) \
+ (ia) = CK_STAILQ_NEXT((ia), ia_link)) \
continue; \
- if ((ia) != NULL) \
- ifa_ref(&(ia)->ia_ifa); \
IN_IFADDR_RUNLOCK((t)); \
} while (0)
@@ -329,21 +328,53 @@ SYSCTL_DECL(_net_inet_raw);
* consumers of IN_*_MULTI() macros should acquire the locks before
* calling them; users of the in_{add,del}multi() functions should not.
*/
-extern struct mtx in_multi_mtx;
-#define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx)
-#define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx)
-#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED)
-#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED)
+extern struct mtx in_multi_list_mtx;
+extern struct sx in_multi_sx;
+
+#define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx)
+#define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx)
+#define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED)
+#define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED)
+
+#define IN_MULTI_LOCK() sx_xlock(&in_multi_sx)
+#define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx)
+#define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED)
+#define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED)
+
+void inm_disconnect(struct in_multi *inm);
+extern int ifma_restart;
/* Acquire an in_multi record. */
static __inline void
inm_acquire_locked(struct in_multi *inm)
{
- IN_MULTI_LOCK_ASSERT();
+ IN_MULTI_LIST_LOCK_ASSERT();
++inm->inm_refcount;
}
+static __inline void
+inm_acquire(struct in_multi *inm)
+{
+ IN_MULTI_LIST_LOCK();
+ inm_acquire_locked(inm);
+ IN_MULTI_LIST_UNLOCK();
+}
+
+static __inline void
+inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm)
+{
+ MPASS(inm->inm_refcount > 0);
+ IN_MULTI_LIST_LOCK_ASSERT();
+
+ if (--inm->inm_refcount == 0) {
+ MPASS(inmh != NULL);
+ inm_disconnect(inm);
+ inm->inm_ifma->ifma_protospec = NULL;
+ SLIST_INSERT_HEAD(inmh, inm, inm_nrele);
+ }
+}
+
/*
* Return values for imo_multi_filter().
*/
@@ -364,11 +395,10 @@ void inm_commit(struct in_multi *);
void inm_clear_recorded(struct in_multi *);
void inm_print(const struct in_multi *);
int inm_record_source(struct in_multi *inm, const in_addr_t);
-void inm_release(struct in_multi *);
-void inm_release_locked(struct in_multi *);
+void inm_release_deferred(struct in_multi *);
+void inm_release_list_deferred(struct in_multi_head *);
struct in_multi *
- in_addmulti(struct in_addr *, struct ifnet *);
-void in_delmulti(struct in_multi *);
+in_addmulti(struct in_addr *, struct ifnet *);
int in_joingroup(struct ifnet *, const struct in_addr *,
/*const*/ struct in_mfilter *, struct in_multi **);
int in_joingroup_locked(struct ifnet *, const struct in_addr *,
diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c
index e2bd0a0a..6f5160e0 100644
--- a/freebsd/sys/netinet/ip_carp.c
+++ b/freebsd/sys/netinet/ip_carp.c
@@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$");
#include <sys/counter.h>
#include <net/ethernet.h>
-#include <net/fddi.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_dl.h>
@@ -213,11 +212,13 @@ static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
#define V_carp_ifdown_adj VNET(carp_ifdown_adj)
+static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS);
static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP");
-SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW,
- &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets");
+SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I",
+ "Accept incoming CARP packets");
SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
@@ -277,8 +278,7 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
} while (0)
#define IFNET_FOREACH_IFA(ifp, ifa) \
- IF_ADDR_LOCK_ASSERT(ifp); \
- TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \
+ CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \
if ((ifa)->ifa_carp != NULL)
#define CARP_FOREACH_IFA(sc, ifa) \
@@ -879,7 +879,7 @@ carp_best_ifa(int af, struct ifnet *ifp)
return (NULL);
best = NULL;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family == af &&
(best == NULL || ifa_preferred(best, ifa)))
best = ifa;
@@ -1161,7 +1161,7 @@ carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
ifa = NULL;
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
@@ -1294,7 +1294,8 @@ carp_setrun(struct carp_softc *sc, sa_family_t af)
if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
- (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
+ (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) ||
+ !V_carp_allow)
return;
switch (sc->sc_state) {
@@ -1408,7 +1409,7 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
break;
}
in6m = NULL;
- if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
+ if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) {
free(im6o->im6o_membership, M_CARP);
break;
}
@@ -1423,13 +1424,13 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
in6.s6_addr32[3] = 0;
in6.s6_addr8[12] = 0xff;
if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
- in6_mc_leave(im6o->im6o_membership[0], NULL);
+ in6_leavegroup(im6o->im6o_membership[0], NULL);
free(im6o->im6o_membership, M_CARP);
break;
}
in6m = NULL;
- if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
- in6_mc_leave(im6o->im6o_membership[0], NULL);
+ if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) {
+ in6_leavegroup(im6o->im6o_membership[0], NULL);
free(im6o->im6o_membership, M_CARP);
break;
}
@@ -1472,8 +1473,8 @@ carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
if (cif->cif_naddrs6 == 0) {
struct ip6_moptions *im6o = &cif->cif_im6o;
- in6_mc_leave(im6o->im6o_membership[0], NULL);
- in6_mc_leave(im6o->im6o_membership[1], NULL);
+ in6_leavegroup(im6o->im6o_membership[0], NULL);
+ in6_leavegroup(im6o->im6o_membership[1], NULL);
KASSERT(im6o->im6o_mfilters == NULL,
("%s: im6o_mfilters != NULL", __func__));
free(im6o->im6o_membership, M_CARP);
@@ -1528,18 +1529,6 @@ carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
eh->ether_shost[5] = sc->sc_vhid;
}
break;
- case IFT_FDDI: {
- struct fddi_header *fh;
-
- fh = mtod(m, struct fddi_header *);
- fh->fddi_shost[0] = 0;
- fh->fddi_shost[1] = 0;
- fh->fddi_shost[2] = 0x5e;
- fh->fddi_shost[3] = 0;
- fh->fddi_shost[4] = 1;
- fh->fddi_shost[5] = sc->sc_vhid;
- }
- break;
default:
printf("%s: carp is not supported for the %d interface type\n",
ifp->if_xname, ifp->if_type);
@@ -1721,7 +1710,6 @@ carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
case IFT_ETHER:
case IFT_L2VLAN:
case IFT_BRIDGE:
- case IFT_FDDI:
break;
default:
error = EOPNOTSUPP;
@@ -2057,7 +2045,8 @@ carp_sc_state(struct carp_softc *sc)
CARP_LOCK_ASSERT(sc);
if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
- !(sc->sc_carpdev->if_flags & IFF_UP)) {
+ !(sc->sc_carpdev->if_flags & IFF_UP) ||
+ !V_carp_allow) {
callout_stop(&sc->sc_ad_tmo);
#ifdef INET
callout_stop(&sc->sc_md_tmo);
@@ -2088,6 +2077,33 @@ carp_demote_adj(int adj, char *reason)
}
static int
+carp_allow_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ int new, error;
+ struct carp_softc *sc;
+
+ new = V_carp_allow;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error || !req->newptr)
+ return (error);
+
+ if (V_carp_allow != new) {
+ V_carp_allow = new;
+
+ mtx_lock(&carp_mtx);
+ LIST_FOREACH(sc, &carp_list, sc_next) {
+ CARP_LOCK(sc);
+ if (curvnet == sc->sc_carpdev->if_vnet)
+ carp_sc_state(sc);
+ CARP_UNLOCK(sc);
+ }
+ mtx_unlock(&carp_mtx);
+ }
+
+ return (0);
+}
+
+static int
carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
{
int new, error;
diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c
index 53a0445e..84f39023 100644
--- a/freebsd/sys/netinet/ip_divert.c
+++ b/freebsd/sys/netinet/ip_divert.c
@@ -76,7 +76,6 @@ __FBSDID("$FreeBSD$");
#endif
#include <security/mac/mac_framework.h>
-
/*
* Divert sockets
*/
@@ -237,7 +236,7 @@ divert_packet(struct mbuf *m, int incoming)
/* Find IP address for receive interface */
ifp = m->m_pkthdr.rcvif;
if_addr_rlock(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
divsrc.sin_addr =
@@ -471,13 +470,15 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
bzero(sin->sin_zero, sizeof(sin->sin_zero));
sin->sin_port = 0;
+ NET_EPOCH_ENTER();
ifa = ifa_ifwithaddr((struct sockaddr *) sin);
if (ifa == NULL) {
error = EADDRNOTAVAIL;
+ NET_EPOCH_EXIT();
goto cantsend;
}
m->m_pkthdr.rcvif = ifa->ifa_ifp;
- ifa_free(ifa);
+ NET_EPOCH_EXIT();
}
#ifdef MAC
mac_socket_create_mbuf(so, m);
@@ -553,6 +554,7 @@ div_detach(struct socket *so)
KASSERT(inp != NULL, ("div_detach: inp == NULL"));
INP_INFO_WLOCK(&V_divcbinfo);
INP_WLOCK(inp);
+ /* XXX defer destruction to epoch_call */
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_divcbinfo);
@@ -632,6 +634,7 @@ static int
div_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, n;
+ struct in_pcblist *il;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
@@ -671,9 +674,8 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
if (error)
return error;
- inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == NULL)
- return ENOMEM;
+ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS);
+ inp_list = il->il_inp_list;
INP_INFO_RLOCK(&V_divcbinfo);
for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n;
@@ -702,14 +704,9 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_divcbinfo);
- for (i = 0; i < n; i++) {
- inp = inp_list[i];
- INP_RLOCK(inp);
- if (!in_pcbrele_rlocked(inp))
- INP_RUNLOCK(inp);
- }
- INP_INFO_WUNLOCK(&V_divcbinfo);
+ il->il_count = n;
+ il->il_pcbinfo = &V_divcbinfo;
+ epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked);
if (!error) {
/*
@@ -726,7 +723,6 @@ div_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_RUNLOCK(&V_divcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
- free(inp_list, M_TEMP);
return error;
}
@@ -806,6 +802,7 @@ div_modevent(module_t mod, int type, void *unused)
break;
}
ip_divert_ptr = NULL;
+ /* XXX defer to epoch_call ? */
err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
INP_INFO_WUNLOCK(&V_divcbinfo);
#ifndef VIMAGE
diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c
index d0866b00..52cd0b40 100644
--- a/freebsd/sys/netinet/ip_encap.c
+++ b/freebsd/sys/netinet/ip_encap.c
@@ -110,15 +110,6 @@ static struct mtx encapmtx;
MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF);
static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
-/*
- * We currently keey encap_init() for source code compatibility reasons --
- * it's referenced by KAME pieces in netinet6.
- */
-void
-encap_init(void)
-{
-}
-
#ifdef INET
int
encap4_input(struct mbuf **mp, int *offp, int proto)
diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h
index bbbee390..ef232189 100644
--- a/freebsd/sys/netinet/ip_encap.h
+++ b/freebsd/sys/netinet/ip_encap.h
@@ -50,7 +50,6 @@ struct encaptab {
void *arg; /* passed via m->m_pkthdr.aux */
};
-void encap_init(void);
int encap4_input(struct mbuf **, int *, int);
int encap6_input(struct mbuf **, int *, int);
const struct encaptab *encap_attach(int, int, const struct sockaddr *,
diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c
index b03fea56..3fc59a14 100644
--- a/freebsd/sys/netinet/ip_icmp.c
+++ b/freebsd/sys/netinet/ip_icmp.c
@@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
+
#ifdef INET
#include <machine/in_cksum.h>
@@ -407,6 +408,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto)
inet_ntoa_r(ip->ip_dst, dstbuf), icmplen);
}
#endif
+ NET_EPOCH_ENTER();
if (icmplen < ICMP_MINLEN) {
ICMPSTAT_INC(icps_tooshort);
goto freeit;
@@ -414,6 +416,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto)
i = hlen + min(icmplen, ICMP_ADVLENMIN);
if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
ICMPSTAT_INC(icps_tooshort);
+ NET_EPOCH_EXIT();
return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
@@ -531,6 +534,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto)
if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
/* This should actually not happen */
ICMPSTAT_INC(icps_tooshort);
+ NET_EPOCH_EXIT();
return (IPPROTO_DONE);
}
ip = mtod(m, struct ip *);
@@ -606,10 +610,8 @@ icmp_input(struct mbuf **mp, int *offp, int proto)
(struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
if (ia == NULL)
break;
- if (ia->ia_ifp == NULL) {
- ifa_free(&ia->ia_ifa);
+ if (ia->ia_ifp == NULL)
break;
- }
icp->icmp_type = ICMP_MASKREPLY;
if (V_icmpmaskfake == 0)
icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
@@ -621,11 +623,11 @@ icmp_input(struct mbuf **mp, int *offp, int proto)
else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
}
- ifa_free(&ia->ia_ifa);
reflect:
ICMPSTAT_INC(icps_reflect);
ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
icmp_reflect(m);
+ NET_EPOCH_EXIT();
return (IPPROTO_DONE);
case ICMP_REDIRECT:
@@ -702,11 +704,13 @@ reflect:
}
raw:
+ NET_EPOCH_EXIT();
*mp = m;
rip_input(mp, offp, proto);
return (IPPROTO_DONE);
freeit:
+ NET_EPOCH_EXIT();
m_freem(m);
return (IPPROTO_DONE);
}
@@ -762,7 +766,7 @@ icmp_reflect(struct mbuf *m)
ifp = m->m_pkthdr.rcvif;
if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
@@ -783,7 +787,7 @@ icmp_reflect(struct mbuf *m)
*/
if (V_icmp_rfi && ifp != NULL) {
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
@@ -801,7 +805,7 @@ icmp_reflect(struct mbuf *m)
*/
if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c
index 2c8bf427..343eec5e 100644
--- a/freebsd/sys/netinet/ip_input.c
+++ b/freebsd/sys/netinet/ip_input.c
@@ -306,7 +306,7 @@ ip_init(void)
struct protosw *pr;
int i;
- TAILQ_INIT(&V_in_ifaddrhead);
+ CK_STAILQ_INIT(&V_in_ifaddrhead);
V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
/* Initialize IP reassembly queue. */
@@ -401,7 +401,7 @@ ip_destroy(void *unused __unused)
/* Make sure the IPv4 routes are gone as well. */
IFNET_RLOCK();
- TAILQ_FOREACH(ifp, &V_ifnet, if_link)
+ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
rt_flushifroutes_af(ifp, AF_INET);
IFNET_RUNLOCK();
@@ -652,7 +652,7 @@ passin:
* we receive might be for us (and let the upper layers deal
* with it).
*/
- if (TAILQ_EMPTY(&V_in_ifaddrhead) &&
+ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) &&
(m->m_flags & (M_MCAST|M_BCAST)) == 0)
goto ours;
@@ -709,7 +709,7 @@ passin:
*/
if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
IF_ADDR_RLOCK(ifp);
- TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
if (ifa->ifa_addr->sa_family != AF_INET)
continue;
ia = ifatoia(ifa);
@@ -979,9 +979,9 @@ ip_forward(struct mbuf *m, int srcrt)
#else
in_rtalloc_ign(&ro, 0, M_GETFIB(m));
#endif
+ NET_EPOCH_ENTER();
if (ro.ro_rt != NULL) {
ia = ifatoia(ro.ro_rt->rt_ifa);
- ifa_ref(&ia->ia_ifa);
} else
ia = NULL;
/*
@@ -1027,7 +1027,7 @@ ip_forward(struct mbuf *m, int srcrt)
m_freem(mcopy);
if (error != EINPROGRESS)
IPSTAT_INC(ips_cantforward);
- return;
+ goto out;
}
/* No IPsec processing required */
}
@@ -1080,16 +1080,12 @@ ip_forward(struct mbuf *m, int srcrt)
else {
if (mcopy)
m_freem(mcopy);
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- return;
+ goto out;
}
}
- if (mcopy == NULL) {
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- return;
- }
+ if (mcopy == NULL)
+ goto out;
+
switch (error) {
@@ -1131,13 +1127,11 @@ ip_forward(struct mbuf *m, int srcrt)
case ENOBUFS:
case EACCES: /* ipfw denied packet */
m_freem(mcopy);
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
- return;
+ goto out;
}
- if (ia != NULL)
- ifa_free(&ia->ia_ifa);
icmp_error(mcopy, type, code, dest.s_addr, mtu);
+ out:
+ NET_EPOCH_EXIT();
}
#define CHECK_SO_CT(sp, ct) \
diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c
index 3bf4fa91..ac901601 100644
--- a/freebsd/sys/netinet/ip_mroute.c
+++ b/freebsd/sys/netinet/ip_mroute.c
@@ -880,13 +880,15 @@ add_vif(struct vifctl *vifcp)
ifp = NULL;
} else {
sin.sin_addr = vifcp->vifc_lcl_addr;
+ NET_EPOCH_ENTER();
ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
if (ifa == NULL) {
+ NET_EPOCH_EXIT();
VIF_UNLOCK();
return EADDRNOTAVAIL;
}
ifp = ifa->ifa_ifp;
- ifa_free(ifa);
+ NET_EPOCH_EXIT();
}
if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
@@ -1682,7 +1684,7 @@ send_packet(struct vif *vifp, struct mbuf *m)
{
struct ip_moptions imo;
struct in_multi *imm[2];
- int error;
+ int error __unused;
VIF_LOCK_ASSERT();
diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c
index d85aecf3..cc2f3eed 100644
--- a/freebsd/sys/netinet/ip_options.c
+++ b/freebsd/sys/netinet/ip_options.c
@@ -112,6 +112,7 @@ ip_dooptions(struct mbuf *m, int pass)
struct nhop4_extended nh_ext;
struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET };
+ NET_EPOCH_ENTER();
/* Ignore or reject packets with IP options. */
if (V_ip_doopts == 0)
return 0;
@@ -226,6 +227,7 @@ dropit:
#endif
IPSTAT_INC(ips_cantforward);
m_freem(m);
+ NET_EPOCH_EXIT();
return (1);
}
}
@@ -252,7 +254,6 @@ dropit:
memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
- ifa_free(&ia->ia_ifa);
} else {
/* XXX MRT 0 for routing */
if (fib4_lookup_nh_ext(M_GETFIB(m),
@@ -300,7 +301,6 @@ dropit:
if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) {
memcpy(cp + off, &(IA_SIN(ia)->sin_addr),
sizeof(struct in_addr));
- ifa_free(&ia->ia_ifa);
} else if (fib4_lookup_nh_ext(M_GETFIB(m),
ipaddr.sin_addr, 0, 0, &nh_ext) == 0) {
memcpy(cp + off, &nh_ext.nh_src,
@@ -355,7 +355,6 @@ dropit:
continue;
(void)memcpy(sin, &IA_SIN(ia)->sin_addr,
sizeof(struct in_addr));
- ifa_free(&ia->ia_ifa);
cp[IPOPT_OFFSET] += sizeof(struct in_addr);
off += sizeof(struct in_addr);
break;
@@ -383,12 +382,14 @@ dropit:
cp[IPOPT_OFFSET] += sizeof(uint32_t);
}
}
+ NET_EPOCH_EXIT();
if (forward && V_ipforwarding) {
ip_forward(m, 1);
return (1);
}
return (0);
bad:
+ NET_EPOCH_EXIT();
icmp_error(m, type, code, 0, 0);
IPSTAT_INC(ips_badoptions);
return (1);
diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c
index 21b3919a..792f2311 100644
--- a/freebsd/sys/netinet/ip_output.c
+++ b/freebsd/sys/netinet/ip_output.c
@@ -227,7 +227,6 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
struct route iproute;
struct rtentry *rte; /* cache for ro->ro_rt */
uint32_t fibnum;
- int have_ia_ref;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
int no_route_but_check_spd = 0;
#endif
@@ -283,6 +282,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
dst->sin_len = sizeof(*dst);
dst->sin_addr = ip->ip_dst;
}
+ NET_EPOCH_ENTER();
again:
/*
* Validate route against routing table additions;
@@ -308,7 +308,6 @@ again:
rte = NULL;
}
ia = NULL;
- have_ia_ref = 0;
/*
* If routing to interface only, short circuit routing lookup.
* The use of an all-ones broadcast address implies this; an
@@ -324,7 +323,6 @@ again:
error = ENETUNREACH;
goto bad;
}
- have_ia_ref = 1;
ip->ip_dst.s_addr = INADDR_BROADCAST;
dst->sin_addr = ip->ip_dst;
ifp = ia->ia_ifp;
@@ -339,7 +337,6 @@ again:
error = ENETUNREACH;
goto bad;
}
- have_ia_ref = 1;
ifp = ia->ia_ifp;
ip->ip_ttl = 1;
isbroadcast = ifp->if_flags & IFF_BROADCAST ?
@@ -352,8 +349,6 @@ again:
*/
ifp = imo->imo_multicast_ifp;
IFP_TO_IA(ifp, ia, &in_ifa_tracker);
- if (ia)
- have_ia_ref = 1;
isbroadcast = 0; /* fool gcc */
} else {
/*
@@ -581,8 +576,6 @@ sendit:
case -1: /* Need to try again */
/* Reset everything for a new round */
RO_RTFREE(ro);
- if (have_ia_ref)
- ifa_free(&ia->ia_ifa);
ro->ro_prepend = NULL;
rte = NULL;
gw = dst;
@@ -737,10 +730,9 @@ done:
* calling RTFREE on it again.
*/
ro->ro_rt = NULL;
- if (have_ia_ref)
- ifa_free(&ia->ia_ifa);
+ NET_EPOCH_EXIT();
return (error);
-bad:
+ bad:
m_freem(m);
goto done;
}
diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h
index 9e7ee591..f874628a 100644
--- a/freebsd/sys/netinet/ip_var.h
+++ b/freebsd/sys/netinet/ip_var.h
@@ -36,6 +36,7 @@
#define _NETINET_IP_VAR_H_
#include <sys/queue.h>
+#include <sys/epoch.h>
/*
* Overlay for ip header used by other protocols (tcp, udp).
@@ -95,7 +96,7 @@ struct ip_moptions {
u_short imo_max_memberships; /* max memberships this socket */
struct in_multi **imo_membership; /* group memberships */
struct in_mfilter *imo_mfilters; /* source filters */
- STAILQ_ENTRY(ip_moptions) imo_link;
+ struct epoch_context imo_epoch_ctx;
};
struct ipstat {
@@ -175,6 +176,7 @@ struct ip;
struct inpcb;
struct route;
struct sockopt;
+struct inpcbinfo;
VNET_DECLARE(int, ip_defttl); /* default IP ttl */
VNET_DECLARE(int, ipforwarding); /* ip forwarding */
diff --git a/freebsd/sys/netinet/netdump/netdump.h b/freebsd/sys/netinet/netdump/netdump.h
new file mode 100644
index 00000000..12a527ee
--- /dev/null
+++ b/freebsd/sys/netinet/netdump/netdump.h
@@ -0,0 +1,132 @@
+/*-
+ * Copyright (c) 2005-2014 Sandvine Incorporated
+ * Copyright (c) 2000 Darrell Anderson <anderson@cs.duke.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_NETDUMP_H_
+#define _NETINET_NETDUMP_H_
+
+#include <sys/types.h>
+#include <sys/disk.h>
+#include <sys/ioccom.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+
+#define NETDUMP_PORT 20023 /* Server UDP port for heralds. */
+#define NETDUMP_ACKPORT 20024 /* Client UDP port for acks. */
+
+#define NETDUMP_HERALD 1 /* Broadcast before starting a dump. */
+#define NETDUMP_FINISHED 2 /* Send after finishing a dump. */
+#define NETDUMP_VMCORE 3 /* Contains dump data. */
+#define NETDUMP_KDH 4 /* Contains kernel dump header. */
+#define NETDUMP_EKCD_KEY 5 /* Contains kernel dump key. */
+
+#define NETDUMP_DATASIZE 4096 /* Arbitrary packet size limit. */
+
+struct netdump_msg_hdr {
+ uint32_t mh_type; /* Netdump message type. */
+ uint32_t mh_seqno; /* Match acks with msgs. */
+ uint64_t mh_offset; /* vmcore offset (bytes). */
+ uint32_t mh_len; /* Attached data (bytes). */
+ uint32_t mh__pad;
+} __packed;
+
+struct netdump_ack {
+ uint32_t na_seqno; /* Match acks with msgs. */
+} __packed;
+
+struct netdump_conf {
+#ifndef __rtems__
+ struct diocskerneldump_arg ndc_kda;
+#endif /* __rtems__ */
+ char ndc_iface[IFNAMSIZ];
+ struct in_addr ndc_server;
+ struct in_addr ndc_client;
+ struct in_addr ndc_gateway;
+};
+
+#define _PATH_NETDUMP "/dev/netdump"
+
+#define NETDUMPGCONF _IOR('n', 1, struct netdump_conf)
+#define NETDUMPSCONF _IOW('n', 2, struct netdump_conf)
+
+#ifdef _KERNEL
+#ifdef NETDUMP
+
+#define NETDUMP_MAX_IN_FLIGHT 64
+
+enum netdump_ev {
+ NETDUMP_START,
+ NETDUMP_END,
+};
+
+struct ifnet;
+struct mbuf;
+
+void netdump_reinit(struct ifnet *);
+
+typedef void netdump_init_t(struct ifnet *, int *nrxr, int *ncl, int *clsize);
+typedef void netdump_event_t(struct ifnet *, enum netdump_ev);
+typedef int netdump_transmit_t(struct ifnet *, struct mbuf *);
+typedef int netdump_poll_t(struct ifnet *, int);
+
+struct netdump_methods {
+ netdump_init_t *nd_init;
+ netdump_event_t *nd_event;
+ netdump_transmit_t *nd_transmit;
+ netdump_poll_t *nd_poll;
+};
+
+#define NETDUMP_DEFINE(driver) \
+ static netdump_init_t driver##_netdump_init; \
+ static netdump_event_t driver##_netdump_event; \
+ static netdump_transmit_t driver##_netdump_transmit; \
+ static netdump_poll_t driver##_netdump_poll; \
+ \
+ static struct netdump_methods driver##_netdump_methods = { \
+ .nd_init = driver##_netdump_init, \
+ .nd_event = driver##_netdump_event, \
+ .nd_transmit = driver##_netdump_transmit, \
+ .nd_poll = driver##_netdump_poll, \
+ }
+
+#define NETDUMP_REINIT(ifp) netdump_reinit(ifp)
+
+#define NETDUMP_SET(ifp, driver) \
+ (ifp)->if_netdump_methods = &driver##_netdump_methods
+
+#else /* !NETDUMP */
+
+#define NETDUMP_DEFINE(driver)
+#define NETDUMP_REINIT(ifp)
+#define NETDUMP_SET(ifp, driver)
+
+#endif /* NETDUMP */
+#endif /* _KERNEL */
+
+#endif /* _NETINET_NETDUMP_H_ */
diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c
index 0ed185ae..7dea3ec1 100644
--- a/freebsd/sys/netinet/raw_ip.c
+++ b/freebsd/sys/netinet/raw_ip.c
@@ -745,7 +745,7 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
switch (cmd) {
case PRC_IFDOWN:
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa
&& (ia->ia_flags & IFA_ROUTE)) {
ifa_ref(&ia->ia_ifa);
@@ -771,7 +771,7 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
case PRC_IFUP:
IN_IFADDR_RLOCK(&in_ifa_tracker);
- TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+ CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
if (ia->ia_ifa.ifa_addr == sa)
break;
}
@@ -853,6 +853,7 @@ rip_detach(struct socket *so)
ip_rsvp_force_done(so);
if (so == V_ip_rsvpd)
ip_rsvp_done();
+ /* XXX defer to epoch_call */
in_pcbdetach(inp);
in_pcbfree(inp);
INP_INFO_WUNLOCK(&V_ripcbinfo);
@@ -930,7 +931,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
- if (TAILQ_EMPTY(&V_ifnet) ||
+ if (CK_STAILQ_EMPTY(&V_ifnet) ||
(addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
(addr->sin_addr.s_addr &&
(inp->inp_flags & INP_BINDANY) == 0 &&
@@ -955,7 +956,7 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
if (nam->sa_len != sizeof(*addr))
return (EINVAL);
- if (TAILQ_EMPTY(&V_ifnet))
+ if (CK_STAILQ_EMPTY(&V_ifnet))
return (EADDRNOTAVAIL);
if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
return (EAFNOSUPPORT);
@@ -1022,6 +1023,7 @@ static int
rip_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, n;
+ struct in_pcblist *il;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
@@ -1056,9 +1058,8 @@ rip_pcblist(SYSCTL_HANDLER_ARGS)
if (error)
return (error);
- inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == NULL)
- return (ENOMEM);
+ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS);
+ inp_list = il->il_inp_list;
INP_INFO_RLOCK(&V_ripcbinfo);
for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n;
@@ -1087,14 +1088,9 @@ rip_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_ripcbinfo);
- for (i = 0; i < n; i++) {
- inp = inp_list[i];
- INP_RLOCK(inp);
- if (!in_pcbrele_rlocked(inp))
- INP_RUNLOCK(inp);
- }
- INP_INFO_WUNLOCK(&V_ripcbinfo);
+ il->il_count = n;
+ il->il_pcbinfo = &V_ripcbinfo;
+ epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked);
if (!error) {
/*
@@ -1110,7 +1106,6 @@ rip_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_RUNLOCK(&V_ripcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
- free(inp_list, M_TEMP);
return (error);
}
diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c
index 7e2ef189..94c23bff 100644
--- a/freebsd/sys/netinet/sctp_bsd_addr.c
+++ b/freebsd/sys/netinet/sctp_bsd_addr.c
@@ -209,13 +209,13 @@ sctp_init_ifns_for_vrf(int vrfid)
#endif
IFNET_RLOCK();
- TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) {
+ CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) {
if (sctp_is_desired_interface_type(ifn) == 0) {
/* non desired type */
continue;
}
IF_ADDR_RLOCK(ifn);
- TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) {
if (ifa->ifa_addr == NULL) {
continue;
}
@@ -362,11 +362,11 @@ void
struct ifaddr *ifa;
IFNET_RLOCK();
- TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) {
+ CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) {
if (!(*pred) (ifn)) {
continue;
}
- TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) {
+ CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) {
sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE);
}
}
@@ -389,10 +389,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header,
m_freem(m);
return (NULL);
}
- }
- if (SCTP_BUF_NEXT(m)) {
- sctp_m_freem(SCTP_BUF_NEXT(m));
- SCTP_BUF_NEXT(m) = NULL;
+ KASSERT(SCTP_BUF_NEXT(m) == NULL, ("%s: no chain allowed", __FUNCTION__));
}
#ifdef SCTP_MBUF_LOGGING
if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c
index 3325dd03..98b397a2 100644
--- a/freebsd/sys/netinet/sctp_indata.c
+++ b/freebsd/sys/netinet/sctp_indata.c
@@ -1673,9 +1673,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc,
struct sctp_nets *net, uint32_t *high_tsn, int *abort_flag,
int *break_flag, int last_chunk, uint8_t chk_type)
{
- /* Process a data chunk */
- /* struct sctp_tmit_chunk *chk; */
- struct sctp_tmit_chunk *chk;
+ struct sctp_tmit_chunk *chk = NULL; /* make gcc happy */
uint32_t tsn, fsn, gap, mid;
struct mbuf *dmbuf;
int the_len;
@@ -3623,7 +3621,9 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc,
SCTP_SO_NOT_LOCKED);
}
/* Make sure to flag we had a FR */
- tp1->whoTo->net_ack++;
+ if (tp1->whoTo != NULL) {
+ tp1->whoTo->net_ack++;
+ }
continue;
}
}
diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c
index 9a74ef4b..ee206551 100644
--- a/freebsd/sys/netinet/sctp_input.c
+++ b/freebsd/sys/netinet/sctp_input.c
@@ -2618,7 +2618,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset,
(sizeof(uint32_t))));
diff = now;
timevalsub(&diff, &time_expires);
- if (diff.tv_sec > UINT32_MAX / 1000000) {
+ if ((uint32_t)diff.tv_sec > UINT32_MAX / 1000000) {
staleness = UINT32_MAX;
} else {
staleness = diff.tv_sec * 1000000;
diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h
index c9eaa069..d8d9e6e8 100644
--- a/freebsd/sys/netinet/sctp_os_bsd.h
+++ b/freebsd/sys/netinet/sctp_os_bsd.h
@@ -40,7 +40,6 @@ __FBSDID("$FreeBSD$");
/*
* includes
*/
-#include <rtems/bsd/local/opt_compat.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_sctp.h>
diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c
index 9dd2e0fa..bdef958c 100644
--- a/freebsd/sys/netinet/sctp_output.c
+++ b/freebsd/sys/netinet/sctp_output.c
@@ -7452,7 +7452,7 @@ dont_do_it:
/* Not enough room for a chunk header, get some */
struct mbuf *m;
- m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 0, MT_DATA);
+ m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 1, MT_DATA);
if (m == NULL) {
/*
* we're in trouble here. _PREPEND below will free
@@ -11032,9 +11032,8 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
struct sctp_chunkhdr *ch;
#if defined(INET) || defined(INET6)
struct udphdr *udp;
- int ret;
#endif
- int len, cause_len, padding_len;
+ int ret, len, cause_len, padding_len;
#ifdef INET
struct sockaddr_in *src_sin, *dst_sin;
struct ip *ip;
@@ -11261,9 +11260,13 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst,
SCTP_LTRACE_ERR_RET_PKT(mout, NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT);
return;
}
+ SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret);
SCTP_STAT_INCR(sctps_sendpackets);
SCTP_STAT_INCR_COUNTER64(sctps_outpackets);
SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks);
+ if (ret) {
+ SCTP_STAT_INCR(sctps_senderrors);
+ }
return;
}
diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c
index 05ddee01..071d44c2 100644
--- a/freebsd/sys/netinet/sctp_usrreq.c
+++ b/freebsd/sys/netinet/sctp_usrreq.c
@@ -206,7 +206,7 @@ sctp_notify(struct sctp_inpcb *inp,
#endif
/* no need to unlock here, since the TCB is gone */
} else if (icmp_code == ICMP_UNREACH_NEEDFRAG) {
- if ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0) {
+ if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
SCTP_TCB_UNLOCK(stcb);
return;
}
@@ -707,22 +707,10 @@ sctp_disconnect(struct socket *so)
if (SCTP_GET_STATE(asoc) !=
SCTP_STATE_COOKIE_WAIT) {
/* Left with Data unread */
- struct mbuf *err;
-
- err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA);
- if (err) {
- /*
- * Fill in the user
- * initiated abort
- */
- struct sctp_paramhdr *ph;
+ struct mbuf *op_err;
- ph = mtod(err, struct sctp_paramhdr *);
- SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr);
- ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT);
- ph->param_length = htons(SCTP_BUF_LEN(err));
- }
- sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED);
+ op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
+ sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED);
SCTP_STAT_INCR_COUNTER32(sctps_aborted);
}
SCTP_INP_RUNLOCK(inp);
diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c
index 5511df64..aad1e19d 100644
--- a/freebsd/sys/netinet/sctputil.c
+++ b/freebsd/sys/netinet/sctputil.c
@@ -74,6 +74,7 @@ extern const struct sctp_ss_functions sctp_ss_functions[];
void
sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.sb.stcb = stcb;
@@ -90,11 +91,13 @@ sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.close.inp = (void *)inp;
@@ -114,11 +117,13 @@ sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
rto_logging(struct sctp_nets *net, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
memset(&sctp_clog, 0, sizeof(sctp_clog));
@@ -131,11 +136,13 @@ rto_logging(struct sctp_nets *net, int from)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.strlog.stcb = stcb;
@@ -151,11 +158,13 @@ sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_nagle_event(struct sctp_tcb *stcb, int action)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.nagle.stcb = (void *)stcb;
@@ -170,11 +179,13 @@ sctp_log_nagle_event(struct sctp_tcb *stcb, int action)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.sack.cumack = cumack;
@@ -189,11 +200,13 @@ sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps,
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
memset(&sctp_clog, 0, sizeof(sctp_clog));
@@ -207,11 +220,13 @@ sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
memset(&sctp_clog, 0, sizeof(sctp_clog));
@@ -225,12 +240,14 @@ sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int fr
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
#ifdef SCTP_MBUF_LOGGING
void
sctp_log_mb(struct mbuf *m, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.mb.mp = m;
@@ -251,6 +268,7 @@ sctp_log_mb(struct mbuf *m, int from)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
@@ -267,6 +285,7 @@ sctp_log_mbc(struct mbuf *m, int from)
void
sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
if (control == NULL) {
@@ -291,11 +310,13 @@ sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_rea
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.cwnd.net = net;
@@ -326,11 +347,13 @@ sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
memset(&sctp_clog, 0, sizeof(sctp_clog));
@@ -370,11 +393,13 @@ sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
memset(&sctp_clog, 0, sizeof(sctp_clog));
@@ -397,11 +422,13 @@ sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int b
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.rwnd.rwnd = peers_rwnd;
@@ -415,11 +442,13 @@ sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t ove
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.rwnd.rwnd = peers_rwnd;
@@ -433,12 +462,14 @@ sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint3
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
#ifdef SCTP_MBCNT_LOGGING
static void
sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.mbcnt.total_queue_size = total_oq;
@@ -452,21 +483,25 @@ sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mb
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
#endif
void
sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x",
SCTP_LOG_MISC_EVENT,
from,
a, b, c, d);
+#endif
}
void
sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.wake.stcb = (void *)stcb;
@@ -508,11 +543,13 @@ sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
void
sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen)
{
+#if defined(SCTP_LOCAL_TRACE_BUF)
struct sctp_cwnd_log sctp_clog;
sctp_clog.x.blk.onsb = asoc->total_output_queue_size;
@@ -529,6 +566,7 @@ sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen)
sctp_clog.x.misc.log2,
sctp_clog.x.misc.log3,
sctp_clog.x.misc.log4);
+#endif
}
int
@@ -760,8 +798,8 @@ sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb)
}
/*
- * a list of sizes based on typical mtu's, used only if next hop size not
- * returned.
+ * A list of sizes based on typical mtu's, used only if next hop size not
+ * returned. These values MUST be multiples of 4 and MUST be ordered.
*/
static uint32_t sctp_mtu_sizes[] = {
68,
@@ -770,29 +808,32 @@ static uint32_t sctp_mtu_sizes[] = {
512,
544,
576,
- 1006,
+ 1004,
1492,
1500,
1536,
- 2002,
+ 2000,
2048,
4352,
4464,
8166,
- 17914,
+ 17912,
32000,
- 65535
+ 65532
};
/*
- * Return the largest MTU smaller than val. If there is no
- * entry, just return val.
+ * Return the largest MTU in sctp_mtu_sizes smaller than val.
+ * If val is smaller than the minimum, just return the largest
+ * multiple of 4 smaller or equal to val.
+ * Ensure that the result is a multiple of 4.
*/
uint32_t
sctp_get_prev_mtu(uint32_t val)
{
uint32_t i;
+ val &= 0xfffffffc;
if (val <= sctp_mtu_sizes[0]) {
return (val);
}
@@ -801,12 +842,16 @@ sctp_get_prev_mtu(uint32_t val)
break;
}
}
+ KASSERT((sctp_mtu_sizes[i - 1] & 0x00000003) == 0,
+ ("sctp_mtu_sizes[%u] not a multiple of 4", i - 1));
return (sctp_mtu_sizes[i - 1]);
}
/*
- * Return the smallest MTU larger than val. If there is no
- * entry, just return val.
+ * Return the smallest MTU in sctp_mtu_sizes larger than val.
+ * If val is larger than the maximum, just return the largest multiple of 4 smaller
+ * or equal to val.
+ * Ensure that the result is a multiple of 4.
*/
uint32_t
sctp_get_next_mtu(uint32_t val)
@@ -814,8 +859,11 @@ sctp_get_next_mtu(uint32_t val)
/* select another MTU that is just bigger than this one */
uint32_t i;
+ val &= 0xfffffffc;
for (i = 0; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) {
if (val < sctp_mtu_sizes[i]) {
+ KASSERT((sctp_mtu_sizes[i] & 0x00000003) == 0,
+ ("sctp_mtu_sizes[%u] not a multiple of 4", i));
return (sctp_mtu_sizes[i]);
}
}
@@ -2662,6 +2710,13 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb,
notif_len = (unsigned int)sizeof(struct sctp_assoc_change);
if (abort != NULL) {
abort_len = ntohs(abort->ch.chunk_length);
+ /*
+ * Only SCTP_CHUNK_BUFFER_SIZE bytes are guaranteed to be
+ * contiguous.
+ */
+ if (abort_len > SCTP_CHUNK_BUFFER_SIZE) {
+ abort_len = SCTP_CHUNK_BUFFER_SIZE;
+ }
} else {
abort_len = 0;
}
@@ -3567,6 +3622,13 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro
}
if (chunk != NULL) {
chunk_len = ntohs(chunk->ch.chunk_length);
+ /*
+ * Only SCTP_CHUNK_BUFFER_SIZE bytes are guaranteed to be
+ * contiguous.
+ */
+ if (chunk_len > SCTP_CHUNK_BUFFER_SIZE) {
+ chunk_len = SCTP_CHUNK_BUFFER_SIZE;
+ }
} else {
chunk_len = 0;
}
diff --git a/freebsd/sys/netinet/tcp_hpts.h b/freebsd/sys/netinet/tcp_hpts.h
new file mode 100644
index 00000000..c52a1d78
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_hpts.h
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2016-2018 Netflix Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __tcp_hpts_h__
+#define __tcp_hpts_h__
+
+/*
+ * The hpts uses a wheel of 102400 slots. Each slot
+ * represents a 10 usec increment (102400 x 10 usec).
+ * This gives a range of 10 usec - 1024 ms to place
+ * an entry within. If the user requests more than
+ * 1.024 seconds, a remainder is attached and the hpts,
+ * on seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+TAILQ_HEAD(hptsh, inpcb);
+
+/* Number of useconds in a hpts tick */
+#define HPTS_TICKS_PER_USEC 10
+#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
+#define HPTS_USEC_IN_SEC 1000000
+#define HPTS_MSEC_IN_SEC 1000
+#define HPTS_USEC_IN_MSEC 1000
+
+#define DEFAULT_HPTS_LOG 3072
+
+/*
+ * Log flags consist of
+ * 7f 7f 1 1 bits
+ * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
+ *
+ * So, for example, cpu 10, number 10, with
+ * input active would show up as:
+ * p_flags = 0001010 0001010 1 0
+ * <or>
+ * p_flags = 0x142a
+ */
+#define HPTS_HPTS_ACTIVE 0x01
+#define HPTS_INPUT_ACTIVE 0x02
+
+#define HPTSLOG_IMMEDIATE 1
+#define HPTSLOG_INSERT_NORMAL 2
+#define HPTSLOG_INSERT_SLEEPER 3
+#define HPTSLOG_SLEEP_AFTER 4
+#define HPTSLOG_SLEEP_BEFORE 5
+#define HPTSLOG_INSERTED 6
+#define HPTSLOG_WAKEUP_HPTS 7
+#define HPTSLOG_SETTORUN 8
+#define HPTSLOG_HPTSI 9
+#define HPTSLOG_TOLONG 10
+#define HPTSLOG_AWAKENS 11
+#define HPTSLOG_TIMESOUT 12
+#define HPTSLOG_SLEEPSET 13
+#define HPTSLOG_WAKEUP_INPUT 14
+#define HPTSLOG_RESCHEDULE 15
+#define HPTSLOG_AWAKE 16
+#define HPTSLOG_INP_DONE 17
+
+struct hpts_log {
+ struct inpcb *inp;
+ int32_t event;
+ uint32_t cts;
+ int32_t line;
+ uint32_t ticknow;
+ uint32_t t_paceslot;
+ uint32_t t_hptsreq;
+ uint32_t p_curtick;
+ uint32_t p_prevtick;
+ uint32_t slot_req;
+ uint32_t p_on_queue_cnt;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t p_hpts_sleep_time;
+ uint16_t p_flags;
+ uint8_t p_onhpts;
+ uint8_t p_oninput;
+ uint8_t is_notempty;
+};
+
+struct hpts_diag {
+ uint32_t p_hpts_active;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t slot_req;
+ uint32_t inp_hptsslot;
+ uint32_t slot_now;
+ uint32_t have_slept;
+ uint32_t hpts_sleep_time;
+ uint32_t yet_to_sleep;
+ uint32_t need_new_to;
+ int32_t co_ret;
+ uint8_t p_on_min_sleep;
+};
+
+#ifdef _KERNEL
+/* Each hpts has its own p_mtx which is used for locking */
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ uint32_t p_hpts_active; /* Flag that says hpts is awake */
+ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
+ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range of
+ * slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint32_t enobuf_cnt;
+ uint16_t p_log_at;
+ uint8_t p_direct_wake :1, /* boolean */
+ p_log_wrapped :1, /* boolean */
+ p_on_min_sleep:1; /* boolean */
+ uint8_t p_fill;
+ /* Cache line 0x40 */
+ void *p_inp;
+ struct hptsh p_input; /* For the tcp-input runner */
+ /* Hptsi wheel */
+ struct hptsh *p_hptss;
+ struct hpts_log *p_log;
+ uint32_t p_logsize;
+ int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
+ uint32_t hit_no_enobuf;
+ uint32_t p_dyn_adjust;
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx;
+ struct sysctl_oid *hpts_root;
+ struct intr_event *ie;
+ void *ie_cookie;
+ uint16_t p_num; /* The hpts number one per cpu */
+ uint16_t p_cpu; /* The hpts CPU */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+ struct proc *rp_proc; /* Process structure for hpts */
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+};
+
+#endif
+
+#define HPTS_REMOVE_INPUT 0x01
+#define HPTS_REMOVE_OUTPUT 0x02
+#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP,
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies final ACKs and RSTs that might
+ * be sent when a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
+#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
+ * this determines min granularity of the
+ * hpts. If 0, granularity is 10useconds at
+ * the cost of more CPU (context switching). */
+#ifdef _KERNEL
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
+struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
+int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__)
+
+struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
+#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
+void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
+
+/*
+ * To insert a TCB on the hpts you *must* be holding the
+ * INP_WLOCK(). The hpts insert code will then acquire
+ * the hpts's lock and insert the TCB on the requested
+ * slot, possibly waking up the hpts if you are requesting
+ * a time earlier than what the hpts is sleeping to (if
+ * the hpts is sleeping). You may check the inp->inp_in_hpts
+ * flag without the hpts lock. The hpts is the only one
+ * that will clear this flag holding only the hpts lock. This
+ * means that in your tcp_output() routine when you test for
+ * it to be 1 (so you won't call output) it may be transitioning
+ * to 0 (by the hpts). That will be fine since that will just
+ * mean an extra call to tcp_output that most likely will find
+ * the call you executed (when the mismatch occurred) will have
+ * put the TCB back on the hpts and it will return. If your
+ * call did not add it back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Though usually
+ * you are either doing this from a timer, where you need
+ * that INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ */
+uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
+#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
+
+int
+ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
+#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked);
+int
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line);
+#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+
+uint16_t tcp_hpts_delayedby(struct inpcb *inp);
+
+void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+
+void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
+#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline uint32_t
+tcp_tv_to_hptstick(struct timeval *sv)
+{
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
+}
+
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
+static __inline uint32_t
+tcp_tv_to_usectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+static __inline uint32_t
+tcp_tv_to_mssectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
+}
+
+static __inline void
+tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
+{
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static __inline uint32_t
+tcp_get_usecs(struct timeval *tv)
+{
+ struct timeval tvd;
+
+ if (tv == NULL)
+ tv = &tvd;
+ microuptime(tv);
+ return (tcp_tv_to_usectick(tv));
+}
+
+#endif /* _KERNEL */
+#endif /* __tcp_hpts_h__ */
diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c
index 7c907da9..20bea2de 100644
--- a/freebsd/sys/netinet/tcp_input.c
+++ b/freebsd/sys/netinet/tcp_input.c
@@ -1684,6 +1684,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
to.to_tsecr -= tp->ts_offset;
if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
to.to_tsecr = 0;
+ else if (tp->t_flags & TF_PREVVALID &&
+ tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin))
+ cc_cong_signal(tp, th, CC_RTO_ERR);
}
/*
* Process options only when we get SYN/ACK back. The SYN case
@@ -1796,9 +1799,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(tcps_predack);
/*
- * "bad retransmit" recovery.
+ * "bad retransmit" recovery without timestamps.
*/
- if (tp->t_rxtshift == 1 &&
+ if ((to.to_flags & TOF_TS) == 0 &&
+ tp->t_rxtshift == 1 &&
tp->t_flags & TF_PREVVALID &&
(int)(ticks - tp->t_badrxtwin) < 0) {
cc_cong_signal(tp, th, CC_RTO_ERR);
@@ -2789,8 +2793,10 @@ process_ACK:
* original cwnd and ssthresh, and proceed to transmit where
* we left off.
*/
- if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
- (int)(ticks - tp->t_badrxtwin) < 0)
+ if (tp->t_rxtshift == 1 &&
+ tp->t_flags & TF_PREVVALID &&
+ tp->t_badrxtwin &&
+ SEQ_LT(to.to_tsecr, tp->t_badrxtwin))
cc_cong_signal(tp, th, CC_RTO_ERR);
/*
diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c
index 41302db1..f3ab3b50 100644
--- a/freebsd/sys/netinet/tcp_offload.c
+++ b/freebsd/sys/netinet/tcp_offload.c
@@ -170,6 +170,17 @@ tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name)
}
+/*
+ * Pass the tcp_info structure for this connection to the tod_tcp_info
+ * method of the toedev recorded in tp->tod.
+ *
+ * Asserts that tp->tod is non-NULL and that the inpcb write lock is held.
+ */
void
+tcp_offload_tcp_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+ struct toedev *tod = tp->tod;
+
+ KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tod->tod_tcp_info(tod, tp, ti);
+}
+
+void
tcp_offload_detach(struct tcpcb *tp)
{
struct toedev *tod = tp->tod;
diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h
index 8485fa29..f755ce7e 100644
--- a/freebsd/sys/netinet/tcp_offload.h
+++ b/freebsd/sys/netinet/tcp_offload.h
@@ -45,6 +45,7 @@ void tcp_offload_input(struct tcpcb *, struct mbuf *);
int tcp_offload_output(struct tcpcb *);
void tcp_offload_rcvd(struct tcpcb *);
void tcp_offload_ctloutput(struct tcpcb *, int, int);
+void tcp_offload_tcp_info(struct tcpcb *, struct tcp_info *);
void tcp_offload_detach(struct tcpcb *);
#endif
diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c
index 8762407f..bdbfe984 100644
--- a/freebsd/sys/netinet/tcp_output.c
+++ b/freebsd/sys/netinet/tcp_output.c
@@ -208,7 +208,7 @@ tcp_output(struct tcpcb *tp)
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
unsigned ipsec_optlen = 0;
#endif
- int idle, sendalot;
+ int idle, sendalot, curticks;
int sack_rxmit, sack_bytes_rxmt;
struct sackhole *p;
int tso, mtu;
@@ -810,9 +810,12 @@ send:
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+ curticks = tcp_ts_getticks();
+ to.to_tsval = curticks + tp->ts_offset;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
+ if (tp->t_rxtshift == 1)
+ tp->t_badrxtwin = curticks;
}
/* Set receive buffer autosizing timestamp. */
@@ -1313,10 +1316,6 @@ send:
}
#endif
- /* We're getting ready to send; log now. */
- TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
- len, NULL, false);
-
/*
* Enable TSO and specify the size of the segments.
* The TCP pseudo header checksum is always provided.
@@ -1365,6 +1364,10 @@ send:
#endif /* TCPDEBUG */
TCP_PROBE3(debug__output, tp, th, m);
+ /* We're getting ready to send; log now. */
+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
+ len, NULL, false);
+
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
@@ -1588,8 +1591,6 @@ timer:
SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */
switch (error) {
case EACCES:
- tp->t_softerror = error;
- return (0);
case EPERM:
tp->t_softerror = error;
return (error);
diff --git a/freebsd/sys/netinet/tcp_seq.h b/freebsd/sys/netinet/tcp_seq.h
index b29ae2aa..b6e682ec 100644
--- a/freebsd/sys/netinet/tcp_seq.h
+++ b/freebsd/sys/netinet/tcp_seq.h
@@ -47,10 +47,10 @@
#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
-#define WIN_LT(a,b) ((short)(ntohs(a)-ntohs(b)) < 0)
-#define WIN_LEQ(a,b) ((short)(ntohs(a)-ntohs(b)) <= 0)
-#define WIN_GT(a,b) ((short)(ntohs(a)-ntohs(b)) > 0)
-#define WIN_GEQ(a,b) ((short)(ntohs(a)-ntohs(b)) >= 0)
+#define WIN_LT(a,b) (ntohs(a) < ntohs(b))
+#define WIN_LEQ(a,b) (ntohs(a) <= ntohs(b))
+#define WIN_GT(a,b) (ntohs(a) > ntohs(b))
+#define WIN_GEQ(a,b) (ntohs(a) >= ntohs(b))
#define WIN_MIN(a, b) ((WIN_LT(a, b)) ? (a) : (b))
#define WIN_MAX(a, b) ((WIN_GT(a, b)) ? (a) : (b))
diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c
index 1b19aecb..787213b0 100644
--- a/freebsd/sys/netinet/tcp_subr.c
+++ b/freebsd/sys/netinet/tcp_subr.c
@@ -40,7 +40,6 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
-#include <rtems/bsd/local/opt_compat.h>
#include <rtems/bsd/local/opt_inet.h>
#include <rtems/bsd/local/opt_inet6.h>
#include <rtems/bsd/local/opt_ipsec.h>
@@ -106,6 +105,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@@ -239,6 +239,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone);
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif
+static int tcp_default_fb_init(struct tcpcb *tp);
+static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
+static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static void tcp_mtudisc(struct inpcb *, int);
@@ -247,21 +250,17 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
static struct tcp_function_block tcp_def_funcblk = {
- "default",
- tcp_output,
- tcp_do_segment,
- tcp_default_ctloutput,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- 0,
- 0
+ .tfb_tcp_block_name = "freebsd",
+ .tfb_tcp_output = tcp_output,
+ .tfb_tcp_do_segment = tcp_do_segment,
+ .tfb_tcp_ctloutput = tcp_default_ctloutput,
+ .tfb_tcp_handoff_ok = tcp_default_handoff_ok,
+ .tfb_tcp_fb_init = tcp_default_fb_init,
+ .tfb_tcp_fb_fini = tcp_default_fb_fini,
};
int t_functions_inited = 0;
+static int tcp_fb_cnt = 0;
struct tcp_funchead t_functions;
static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
@@ -334,6 +333,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk)
return(rblk);
}
+/*
+ * Return the function block currently pointed to by tcp_func_set_ptr
+ * with an additional reference taken on its tfb_refcnt.  Both the
+ * pointer read and the reference acquisition happen under the read
+ * side of tcp_function_lock.  The caller owns the returned reference.
+ */
+static struct tcp_function_block *
+find_and_ref_tcp_default_fb(void)
+{
+ struct tcp_function_block *rblk;
+
+ rw_rlock(&tcp_function_lock);
+ rblk = tcp_func_set_ptr;
+ refcount_acquire(&rblk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return (rblk);
+}
+
+/*
+ * Move a connection off its current stack, which must not be the
+ * built-in tcp_def_funcblk (asserted).
+ *
+ * The old stack's tfb_tcp_fb_fini hook, if present, is run and the
+ * connection's reference on it is dropped.  The stack selected by
+ * tcp_func_set_ptr is tried first; it is skipped when it is the stack
+ * being left, when its tfb_tcp_handoff_ok hook returns non-zero, or
+ * when its tfb_tcp_fb_init hook returns non-zero.  In those cases the
+ * built-in tcp_def_funcblk is used instead, and any failure on that
+ * path panics.
+ *
+ * NOTE(review): the caller visible in this file holds the inpcb write
+ * lock around the call; presumably that is required -- confirm.
+ */
+void
+tcp_switch_back_to_default(struct tcpcb *tp)
+{
+ struct tcp_function_block *tfb;
+
+ KASSERT(tp->t_fb != &tcp_def_funcblk,
+ ("%s: called by the built-in default stack", __func__));
+
+ /*
+ * Release the old stack. This function will either find a new one
+ * or panic.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+
+ /*
+ * Now, we'll find a new function block to use.
+ * Start by trying the current user-selected
+ * default, unless this stack is the user-selected
+ * default.
+ */
+ tfb = find_and_ref_tcp_default_fb();
+ /* tp->t_fb has not been reassigned yet: it is still the old stack. */
+ if (tfb == tp->t_fb) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Does the stack accept this connection? */
+ /* A non-zero tfb_tcp_handoff_ok() result is treated as a refusal. */
+ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
+ (*tfb->tfb_tcp_handoff_ok)(tp)) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Try to use that stack. */
+ if (tfb != NULL) {
+ /* Initialize the new stack. If it succeeds, we are done. */
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init == NULL ||
+ (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ return;
+
+ /*
+ * Initialization failed. Release the reference count on
+ * the stack.
+ */
+ refcount_release(&tfb->tfb_refcnt);
+ }
+
+ /*
+ * If that wasn't feasible, use the built-in default
+ * stack which is not allowed to reject anyone.
+ */
+ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
+ if (tfb == NULL) {
+ /* there always should be a default */
+ panic("Can't refer to tcp_def_funcblk");
+ }
+ if (tfb->tfb_tcp_handoff_ok != NULL) {
+ if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
+ /* The default stack cannot say no */
+ panic("Default stack rejects a new session?");
+ }
+ }
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init != NULL &&
+ (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ /* The default stack cannot fail */
+ panic("Default stack initialization failed");
+ }
+}
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
@@ -433,14 +514,14 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
"list available TCP Function sets");
/*
- * Exports one (struct tcp_function_id) for each non-alias.
+ * Exports one (struct tcp_function_info) for each alias/name.
*/
static int
-sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
+sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
{
- int error, cnt;
+ int cnt, error;
struct tcp_function *f;
- struct tcp_function_id tfi;
+ struct tcp_function_info tfi;
/*
* We don't allow writes.
@@ -459,20 +540,31 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
}
/*
- * Walk the list, comparing the name of the function entry and
- * function block to determine which is an alias.
- * If exporting the list, copy out matching entries. Otherwise,
- * just record the total length.
+ * Walk the list and copy out matching entries. If INVARIANTS
+ * is compiled in, also walk the list to verify the length of
+ * the list matches what we have recorded.
*/
- cnt = 0;
rw_rlock(&tcp_function_lock);
+
+ cnt = 0;
+#ifndef INVARIANTS
+ if (req->oldptr == NULL) {
+ cnt = tcp_fb_cnt;
+ goto skip_loop;
+ }
+#endif
TAILQ_FOREACH(f, &t_functions, tf_next) {
- if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name,
- TCP_FUNCTION_NAME_LEN_MAX))
- continue;
+#ifdef INVARIANTS
+ cnt++;
+#endif
if (req->oldptr != NULL) {
+ tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
tfi.tfi_id = f->tf_fb->tfb_id;
- (void)strncpy(tfi.tfi_name, f->tf_name,
+ (void)strncpy(tfi.tfi_alias, f->tf_name,
+ TCP_FUNCTION_NAME_LEN_MAX);
+ tfi.tfi_alias[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+ (void)strncpy(tfi.tfi_name,
+ f->tf_fb->tfb_tcp_block_name,
TCP_FUNCTION_NAME_LEN_MAX);
tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
@@ -481,23 +573,110 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
* mechanism we use to accumulate length
* information if the buffer was too short.
*/
- } else
- cnt++;
+ }
}
+ KASSERT(cnt == tcp_fb_cnt,
+ ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
+#ifndef INVARIANTS
+skip_loop:
+#endif
rw_runlock(&tcp_function_lock);
if (req->oldptr == NULL)
error = SYSCTL_OUT(req, NULL,
- (cnt + 1) * sizeof(struct tcp_function_id));
+ (cnt + 1) * sizeof(struct tcp_function_info));
return (error);
}
-SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids,
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
- NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id",
+ NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
"List TCP function block name-to-ID mappings");
/*
+ * tfb_tcp_handoff_ok() function for the default stack.
+ * Note that we'll basically try to take all comers.
+ */
+static int
+tcp_default_handoff_ok(struct tcpcb *tp)
+{
+
+ /* Always 0: callers treat a non-zero result as a refusal; tp is unused. */
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_init() function for the default stack.
+ *
+ * This handles making sure we have appropriate timers set if you are
+ * transitioning a socket that has some amount of setup done.
+ *
+ * The init() function from the default can *never* return non-zero i.e.
+ * it is required to always succeed since it is the stack of last resort!
+ *
+ * The inpcb write lock must be held (asserted).  Always returns 0.
+ */
+static int
+tcp_default_fb_init(struct tcpcb *tp)
+{
+
+ struct socket *so;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
+ ("%s: connection %p in unexpected state %d", __func__, tp,
+ tp->t_state));
+
+ /*
+ * Nothing to do for states up to and including LISTEN (an earlier
+ * version of this comment said "ESTABLISHED", but established
+ * sessions fall through to the timer setup below).  And, we don't
+ * know what to do for unexpected states (which includes TIME_WAIT);
+ * with KASSERT compiled out this check is what filters them.
+ */
+ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
+ return (0);
+
+ /*
+ * Make sure some kind of transmission timer is set if there is
+ * outstanding data.
+ */
+ so = tp->t_inpcb->inp_socket;
+ /*
+ * Arm a timer only when neither TT_REXMT nor TT_PERSIST is already
+ * active and the session is not yet established, has bytes in the
+ * send buffer, or has sent data that is not yet acknowledged.
+ */
+ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
+ tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
+ tcp_timer_active(tp, TT_PERSIST))) {
+ /*
+ * If the session has established and it looks like it should
+ * be in the persist state, set the persist timer. Otherwise,
+ * set the retransmit timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
+ (int32_t)(tp->snd_nxt - tp->snd_una) <
+ (int32_t)sbavail(&so->so_snd))
+ tcp_setpersist(tp);
+ else
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ }
+
+ /* All non-embryonic sessions get a keepalive timer. */
+ /* Established sessions use TP_KEEPIDLE, all others TP_KEEPINIT. */
+ if (!tcp_timer_active(tp, TT_KEEP))
+ tcp_timer_activate(tp, TT_KEEP,
+ TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
+ TP_KEEPINIT(tp));
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_fini() function for the default stack.
+ *
+ * This is the hook where the default stack would change state to
+ * prepare for another stack to assume responsibility for the
+ * connection.  As written it has nothing to undo: it only asserts that
+ * the inpcb write lock is held and returns.  tcb_is_purged is unused.
+ */
+static void
+tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return;
+}
+
+/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
@@ -660,6 +839,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
(void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX);
n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
+ tcp_fb_cnt++;
rw_wunlock(&tcp_function_lock);
}
return(0);
@@ -676,6 +856,7 @@ cleanup:
if (!strncmp(n->tf_name, names[i],
TCP_FUNCTION_NAME_LEN_MAX)) {
TAILQ_REMOVE(&t_functions, n, tf_next);
+ tcp_fb_cnt--;
n->tf_fb = NULL;
free(n, M_TCPFUNCTIONS);
break;
@@ -721,11 +902,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait)
return (register_tcp_functions_as_name(blk, NULL, wait));
}
+/*
+ * Deregister all names associated with a function block. This
+ * functionally removes the function block from use within the system.
+ *
+ * When called with a true quiesce argument, mark the function block
+ * as being removed so no more stacks will use it and determine
+ * whether the removal would succeed.
+ *
+ * When called with a false quiesce argument, actually attempt the
+ * removal.
+ *
+ * When called with a force argument, attempt to switch all TCBs to
+ * use the default stack instead of returning EBUSY.
+ *
+ * Returns 0 on success (or if the removal would succeed), or an error
+ * code on failure.
+ */
int
-deregister_tcp_functions(struct tcp_function_block *blk)
+deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force)
{
struct tcp_function *f;
- int error=ENOENT;
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
@@ -737,21 +935,64 @@ deregister_tcp_functions(struct tcp_function_block *blk)
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ /* Mark the block so no more stacks can use it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ /*
+ * If TCBs are still attached to the stack, attempt to switch them
+ * to the default stack.
+ */
+ if (force && blk->tfb_refcnt) {
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ rw_wunlock(&tcp_function_lock);
+
+ VNET_LIST_RLOCK();
+ /* XXX handle */
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if (tp == NULL || tp->t_fb != blk) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tcp_switch_back_to_default(tp);
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ rw_wlock(&tcp_function_lock);
+ }
if (blk->tfb_refcnt) {
- /* Still tcb attached, mark it. */
- blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ /* TCBs still attached. */
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ if (quiesce) {
+ /* Skip removal. */
+ rw_wunlock(&tcp_function_lock);
+ return (0);
+ }
+ /* Remove any function names that map to this function block. */
while (find_tcp_fb_locked(blk, &f) != NULL) {
- /* Found */
TAILQ_REMOVE(&t_functions, f, tf_next);
+ tcp_fb_cnt--;
f->tf_fb = NULL;
free(f, M_TCPFUNCTIONS);
- error = 0;
}
rw_wunlock(&tcp_function_lock);
- return (error);
+ return (0);
}
void
@@ -1498,6 +1739,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo)
tmpalgo = CC_ALGO(tp);
/* NewReno does not require any init. */
CC_ALGO(tp) = &newreno_cc_algo;
+ /* XXX defer to epoch_call */
if (tmpalgo->cb_destroy != NULL)
tmpalgo->cb_destroy(tp->ccv);
}
@@ -1545,7 +1787,7 @@ tcp_discardcb(struct tcpcb *tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- int released;
+ int released __unused;
INP_WLOCK_ASSERT(inp);
@@ -1868,6 +2110,7 @@ static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, m, n, pcb_count;
+ struct in_pcblist *il;
struct inpcb *inp, **inp_list;
inp_gen_t gencnt;
struct xinpgen xig;
@@ -1914,7 +2157,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
if (error)
return (error);
- inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
+ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS);
+ inp_list = il->il_inp_list;
INP_INFO_WLOCK(&V_tcbinfo);
for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0;
@@ -1957,14 +2201,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_RLOCK(&V_tcbinfo);
- for (i = 0; i < n; i++) {
- inp = inp_list[i];
- INP_RLOCK(inp);
- if (!in_pcbrele_rlocked(inp))
- INP_RUNLOCK(inp);
- }
- INP_INFO_RUNLOCK(&V_tcbinfo);
+
+ il->il_count = n;
+ il->il_pcbinfo = &V_tcbinfo;
+ epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked);
if (!error) {
/*
@@ -1981,7 +2221,6 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
INP_LIST_RUNLOCK(&V_tcbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
- free(inp_list, M_TEMP);
return (error);
}
diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c
index 27e0e25f..e163aa54 100644
--- a/freebsd/sys/netinet/tcp_syncache.c
+++ b/freebsd/sys/netinet/tcp_syncache.c
@@ -862,6 +862,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = rblk;
+ /*
+ * XXXrrs this is quite dangerous, it is possible
+ * for the new function to fail to init. We also
+ * are not asking if the handoff_is_ok though at
+ * the very start that's probably ok.
+ */
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c
index 69bd052b..422e5122 100644
--- a/freebsd/sys/netinet/tcp_timer.c
+++ b/freebsd/sys/netinet/tcp_timer.c
@@ -664,8 +664,7 @@ tcp_timer_rexmt(void * xtp)
tcp_inpinfo_lock_del(inp, tp);
goto out;
}
- tp = tcp_drop(tp, tp->t_softerror ?
- tp->t_softerror : ETIMEDOUT);
+ tp = tcp_drop(tp, ETIMEDOUT);
tcp_inpinfo_lock_del(inp, tp);
goto out;
}
@@ -696,7 +695,12 @@ tcp_timer_rexmt(void * xtp)
tp->t_flags |= TF_WASCRECOVERY;
else
tp->t_flags &= ~TF_WASCRECOVERY;
- tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
+ tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ /* In the event that we've negotiated timestamps
+ * badrxtwin will be set to the value that we set
+ * the retransmitted packet's to_tsval to by tcp_output
+ */
tp->t_flags |= TF_PREVVALID;
} else
tp->t_flags &= ~TF_PREVVALID;
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c
index aa26cb37..afadf7cd 100644
--- a/freebsd/sys/netinet/tcp_timewait.c
+++ b/freebsd/sys/netinet/tcp_timewait.c
@@ -647,7 +647,7 @@ tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
struct ucred *cred;
struct inpcb *inp;
- int released;
+ int released __unused;
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c
index c93b2d4a..bf2cff4c 100644
--- a/freebsd/sys/netinet/tcp_usrreq.c
+++ b/freebsd/sys/netinet/tcp_usrreq.c
@@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
+#include <netinet/tcp_hpts.h>
#ifdef TCPPCAP
#include <netinet/tcp_pcap.h>
#endif
@@ -1097,7 +1098,9 @@ tcp_usr_abort(struct socket *so)
!(inp->inp_flags & INP_DROPPED)) {
tp = intotcpcb(inp);
TCPDEBUG1();
- tcp_drop(tp, ECONNABORTED);
+ tp = tcp_drop(tp, ECONNABORTED);
+ if (tp == NULL)
+ goto dropped;
TCPDEBUG2(PRU_ABORT);
TCP_PROBE2(debug__user, tp, PRU_ABORT);
}
@@ -1108,6 +1111,7 @@ tcp_usr_abort(struct socket *so)
inp->inp_flags |= INP_SOCKREF;
}
INP_WUNLOCK(inp);
+dropped:
INP_INFO_RUNLOCK(&V_tcbinfo);
}
@@ -1395,11 +1399,15 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
ti->tcpi_snd_nxt = tp->snd_nxt;
ti->tcpi_snd_mss = tp->t_maxseg;
ti->tcpi_rcv_mss = tp->t_maxseg;
- if (tp->t_flags & TF_TOE)
- ti->tcpi_options |= TCPI_OPT_TOE;
ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+#ifdef TCP_OFFLOAD
+ if (tp->t_flags & TF_TOE) {
+ ti->tcpi_options |= TCPI_OPT_TOE;
+ tcp_offload_tcp_info(tp, ti);
+ }
+#endif
}
/*
@@ -1516,22 +1524,41 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
*/
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
}
+#ifdef TCPHPTS
+ /* Assure that we are not on any hpts */
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+#endif
+ if (blk->tfb_tcp_fb_init) {
+ error = (*blk->tfb_tcp_fb_init)(tp);
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
+ /* Fall back failed, drop the connection */
+ INP_WUNLOCK(inp);
+ soabort(so);
+ return(error);
+ }
+ }
+ goto err_out;
+ }
+ }
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = blk;
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
- }
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
sopt->sopt_name);
}
#endif
+err_out:
INP_WUNLOCK(inp);
return (error);
} else if ((sopt->sopt_dir == SOPT_GET) &&
(sopt->sopt_name == TCP_FUNCTION_BLK)) {
- strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name);
+ strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name,
+ TCP_FUNCTION_NAME_LEN_MAX);
+ fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
fsn.pcbcnt = tp->t_fb->tfb_refcnt;
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &fsn, sizeof fsn);
diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h
index f09bd19c..adaaff61 100644
--- a/freebsd/sys/netinet/tcp_var.h
+++ b/freebsd/sys/netinet/tcp_var.h
@@ -83,123 +83,123 @@ STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
/*
* Tcp control block, one per tcp; fields:
- * Organized for 16 byte cacheline efficiency.
+ * Organized for 64 byte cacheline efficiency based
+ * on common tcp_input/tcp_output processing.
*/
struct tcpcb {
- struct tsegqe_head t_segq; /* segment reassembly queue */
- int t_segqlen; /* segment reassembly queue length */
- int t_dupacks; /* consecutive dup acks recd */
-
- struct tcp_timer *t_timers; /* All the TCP timers in one struct */
-
+ /* Cache line 1 */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
- int t_state; /* state of this connection */
+ struct tcp_function_block *t_fb;/* TCP function call block */
+ void *t_fb_ptr; /* Pointer to t_fb specific data */
+ uint32_t t_maxseg:24, /* maximum segment size */
+ t_logstate:8; /* State of "black box" logging */
+ uint32_t t_state:4, /* state of this connection */
+ bits_spare : 24;
u_int t_flags;
-
- struct vnet *t_vnet; /* back pointer to parent vnet */
-
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
* used to recognize retransmits
*/
tcp_seq snd_nxt; /* send next */
tcp_seq snd_up; /* send urgent pointer */
-
- tcp_seq snd_wl1; /* window update seg seq number */
- tcp_seq snd_wl2; /* window update seg ack number */
- tcp_seq iss; /* initial send sequence number */
- tcp_seq irs; /* initial receive sequence number */
-
+ uint32_t snd_wnd; /* send window */
+ uint32_t snd_cwnd; /* congestion-controlled window */
+ uint32_t cl1_spare; /* Spare to round out CL 1 */
+ /* Cache line 2 */
+ u_int32_t ts_offset; /* our timestamp offset */
+ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
+ int rcv_numsacks; /* # distinct sack blks present */
+ u_int t_tsomax; /* TSO total burst length limit in bytes */
+ u_int t_tsomaxsegcount; /* TSO maximum segment count */
+ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_adv; /* advertised window */
uint32_t rcv_wnd; /* receive window */
+ u_int t_flags2; /* More tcpcb flags storage */
+ int t_srtt; /* smoothed round-trip time */
+ int t_rttvar; /* variance in round-trip time */
+ u_int32_t ts_recent; /* timestamp echo data */
+ u_char snd_scale; /* window scaling for send window */
+ u_char rcv_scale; /* window scaling for recv window */
+ u_char snd_limited; /* segments limited transmitted */
+ u_char request_r_scale; /* pending window scaling */
+ tcp_seq last_ack_sent;
+ u_int t_rcvtime; /* inactivity time */
+ /* Cache line 3 */
tcp_seq rcv_up; /* receive urgent pointer */
-
- uint32_t snd_wnd; /* send window */
- uint32_t snd_cwnd; /* congestion-controlled window */
+ int t_segqlen; /* segment reassembly queue length */
+ struct tsegqe_head t_segq; /* segment reassembly queue */
+ struct mbuf *t_in_pkt;
+ struct mbuf *t_tail_pkt;
+ struct tcp_timer *t_timers; /* All the TCP timers in one struct */
+ struct vnet *t_vnet; /* back pointer to parent vnet */
uint32_t snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
+ tcp_seq snd_wl1; /* window update seg seq number */
+ /* Cache line 4 */
+ tcp_seq snd_wl2; /* window update seg ack number */
+
+ tcp_seq irs; /* initial receive sequence number */
+ tcp_seq iss; /* initial send sequence number */
+ u_int t_acktime;
+ u_int ts_recent_age; /* when last updated */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
+ uint16_t cl4_spare; /* Spare to adjust CL 4 */
+ char t_oobflags; /* have some */
+ char t_iobc; /* input character */
+ int t_rxtcur; /* current retransmit value (ticks) */
- u_int t_rcvtime; /* inactivity time */
- u_int t_starttime; /* time connection was established */
+ int t_rxtshift; /* log(2) of rexmt exp. backoff */
u_int t_rtttime; /* RTT measurement start time */
+
tcp_seq t_rtseq; /* sequence number being timed */
+ u_int t_starttime; /* time connection was established */
- int t_rxtcur; /* current retransmit value (ticks) */
- u_int t_maxseg; /* maximum segment size */
u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */
- int t_srtt; /* smoothed round-trip time */
- int t_rttvar; /* variance in round-trip time */
-
- int t_rxtshift; /* log(2) of rexmt exp. backoff */
u_int t_rttmin; /* minimum rtt allowed */
+
u_int t_rttbest; /* best rtt we've seen */
- u_long t_rttupdated; /* number of times rtt sampled */
- uint32_t max_sndwnd; /* largest window peer has offered */
int t_softerror; /* possible error not yet reported */
-/* out-of-band data */
- char t_oobflags; /* have some */
- char t_iobc; /* input character */
-/* RFC 1323 variables */
- u_char snd_scale; /* window scaling for send window */
- u_char rcv_scale; /* window scaling for recv window */
- u_char request_r_scale; /* pending window scaling */
- u_int32_t ts_recent; /* timestamp echo data */
- u_int ts_recent_age; /* when last updated */
- u_int32_t ts_offset; /* our timestamp offset */
-
- tcp_seq last_ack_sent;
-/* experimental */
+ uint32_t max_sndwnd; /* largest window peer has offered */
+ /* Cache line 5 */
uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */
uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */
tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
int t_sndzerowin; /* zero-window updates sent */
- u_int t_badrxtwin; /* window for retransmit recovery */
- u_char snd_limited; /* segments limited transmitted */
-/* SACK related state */
+ u_long t_rttupdated; /* number of times rtt sampled */
int snd_numholes; /* number of holes seen by sender */
+ u_int t_badrxtwin; /* window for retransmit recovery */
TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
/* SACK scoreboard (sorted) */
tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/
- int rcv_numsacks; /* # distinct sack blks present */
- struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
struct sackhint sackhint; /* SACK scoreboard hint */
int t_rttlow; /* smallest observerved RTT */
- u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
int rfbuf_cnt; /* recv buffer autoscaling byte count */
struct toedev *tod; /* toedev handling this connection */
int t_sndrexmitpack; /* retransmit packets sent */
int t_rcvoopack; /* out-of-order packets received */
void *t_toe; /* TOE pcb pointer */
- int t_bytes_acked; /* # bytes acked during current RTT */
struct cc_algo *cc_algo; /* congestion control algorithm */
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
-
+ int t_bytes_acked; /* # bytes acked during current RTT */
u_int t_keepinit; /* time to establish connection */
u_int t_keepidle; /* time before keepalive probes begin */
u_int t_keepintvl; /* interval between keepalives */
u_int t_keepcnt; /* number of keepalives before close */
-
- u_int t_tsomax; /* TSO total burst length limit in bytes */
- u_int t_tsomaxsegcount; /* TSO maximum segment count */
- u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
- u_int t_flags2; /* More tcpcb flags storage */
- int t_logstate; /* State of "black box" logging */
- struct tcp_log_stailq t_logs; /* Log buffer */
+ int t_dupacks; /* consecutive dup acks recd */
int t_lognum; /* Number of log entries */
- uint32_t t_logsn; /* Log "serial number" */
+ struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
struct tcp_log_id_bucket *t_lib;
const char *t_output_caller; /* Function that called tcp_output */
- struct tcp_function_block *t_fb;/* TCP function call block */
- void *t_fb_ptr; /* Pointer to t_fb specific data */
+ uint32_t t_logsn; /* Log "serial number" */
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */
union {
@@ -257,14 +257,19 @@ struct tcptemp {
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
+ int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int);
+ void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
/* Optional memory allocation/free routine */
- void (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
@@ -274,6 +279,7 @@ struct tcp_function_block {
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -688,11 +694,14 @@ void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
#endif
/*
- * TCP function name-to-id mapping exported to user-land via sysctl(3).
+ * TCP function information (name-to-id mapping, aliases, and refcnt)
+ * exported to user-land via sysctl(3).
*/
-struct tcp_function_id {
+struct tcp_function_info {
+ uint32_t tfi_refcnt;
uint8_t tfi_id;
char tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
+ char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX];
};
/*
@@ -848,9 +857,12 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk,
int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
const char *name, int wait);
-int deregister_tcp_functions(struct tcp_function_block *blk);
+int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
-struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+void tcp_switch_back_to_default(struct tcpcb *tp);
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
diff --git a/freebsd/sys/netinet/toecore.h b/freebsd/sys/netinet/toecore.h
index 633984a6..f2374d70 100644
--- a/freebsd/sys/netinet/toecore.h
+++ b/freebsd/sys/netinet/toecore.h
@@ -38,6 +38,7 @@
struct tcpopt;
struct tcphdr;
struct in_conninfo;
+struct tcp_info;
struct toedev {
TAILQ_ENTRY(toedev) link; /* glue for toedev_list */
@@ -101,6 +102,10 @@ struct toedev {
/* TCP socket option */
void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int);
+
+ /* Update software state */
+ void (*tod_tcp_info)(struct toedev *, struct tcpcb *,
+ struct tcp_info *);
};
#include <sys/eventhandler.h>
diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c
index da2dbe98..178a8d5e 100644
--- a/freebsd/sys/netinet/udp_usrreq.c
+++ b/freebsd/sys/netinet/udp_usrreq.c
@@ -842,6 +842,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
{
int error, i, n;
struct inpcb *inp, **inp_list;
+ struct in_pcblist *il;
inp_gen_t gencnt;
struct xinpgen xig;
@@ -879,10 +880,8 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
error = SYSCTL_OUT(req, &xig, sizeof xig);
if (error)
return (error);
-
- inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
- if (inp_list == NULL)
- return (ENOMEM);
+ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS);
+ inp_list = il->il_inp_list;
INP_INFO_RLOCK(&V_udbinfo);
for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n;
@@ -911,14 +910,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
} else
INP_RUNLOCK(inp);
}
- INP_INFO_WLOCK(&V_udbinfo);
- for (i = 0; i < n; i++) {
- inp = inp_list[i];
- INP_RLOCK(inp);
- if (!in_pcbrele_rlocked(inp))
- INP_RUNLOCK(inp);
- }
- INP_INFO_WUNLOCK(&V_udbinfo);
+ il->il_count = n;
+ il->il_pcbinfo = &V_udbinfo;
+ epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked);
if (!error) {
/*
@@ -934,7 +928,6 @@ udp_pcblist(SYSCTL_HANDLER_ARGS)
INP_INFO_RUNLOCK(&V_udbinfo);
error = SYSCTL_OUT(req, &xig, sizeof xig);
}
- free(inp_list, M_TEMP);
return (error);
}
@@ -1578,6 +1571,7 @@ udp_abort(struct socket *so)
static int
udp_attach(struct socket *so, int proto, struct thread *td)
{
+ static uint32_t udp_flowid;
struct inpcb *inp;
struct inpcbinfo *pcbinfo;
int error;
@@ -1598,6 +1592,8 @@ udp_attach(struct socket *so, int proto, struct thread *td)
inp = sotoinpcb(so);
inp->inp_vflag |= INP_IPV4;
inp->inp_ip_ttl = V_ip_defttl;
+ inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
+ inp->inp_flowtype = M_HASHTYPE_OPAQUE;
error = udp_newudpcb(inp);
if (error) {
@@ -1723,6 +1719,7 @@ udp_detach(struct socket *so)
INP_WLOCK(inp);
up = intoudpcb(inp);
KASSERT(up != NULL, ("%s: up == NULL", __func__));
+ /* XXX defer to epoch_call */
inp->inp_ppcb = NULL;
in_pcbdetach(inp);
in_pcbfree(inp);