diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-08-21 13:47:02 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-09-21 10:29:41 +0200 |
commit | bcdce02d9bc8150e1d191ed5ca9da45b7604964a (patch) | |
tree | 3b2faf509db7672ee1fc98857736470be97e7ed8 /freebsd/sys/netinet | |
parent | Update to FreeBSD head 2018-04-01 (diff) | |
download | rtems-libbsd-bcdce02d9bc8150e1d191ed5ca9da45b7604964a.tar.bz2 |
Update to FreeBSD head 2018-06-01
Git mirror commit fb63610a69b0eb7f69a201ba05c4c1a7a2739cf9.
Update #3472.
Diffstat (limited to 'freebsd/sys/netinet')
43 files changed, 1779 insertions, 710 deletions
diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c index b7f59520..4d5f8644 100644 --- a/freebsd/sys/netinet/cc/cc_newreno.c +++ b/freebsd/sys/netinet/cc/cc_newreno.c @@ -83,7 +83,7 @@ static MALLOC_DEFINE(M_NEWRENO, "newreno data", #define CAST_PTR_INT(X) (*((int*)(X))) -static int newreno_cb_init(struct cc_var *ccv); +static void newreno_cb_destroy(struct cc_var *ccv); static void newreno_ack_received(struct cc_var *ccv, uint16_t type); static void newreno_after_idle(struct cc_var *ccv); static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); @@ -97,7 +97,7 @@ static VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; struct cc_algo newreno_cc_algo = { .name = "newreno", - .cb_init = newreno_cb_init, + .cb_destroy = newreno_cb_destroy, .ack_received = newreno_ack_received, .after_idle = newreno_after_idle, .cong_signal = newreno_cong_signal, @@ -110,18 +110,28 @@ struct newreno { uint32_t beta_ecn; }; -int -newreno_cb_init(struct cc_var *ccv) +static inline struct newreno * +newreno_malloc(struct cc_var *ccv) { - struct newreno *nreno; + struct newreno *nreno; - nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT|M_ZERO); + nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT); if (nreno != NULL) { + /* NB: nreno is not zeroed, so initialise all fields. */ nreno->beta = V_newreno_beta; nreno->beta_ecn = V_newreno_beta_ecn; + ccv->cc_data = nreno; } - return (0); + return (nreno); +} + +static void +newreno_cb_destroy(struct cc_var *ccv) +{ + + if (ccv->cc_data != NULL) + free(ccv->cc_data, M_NEWRENO); } static void @@ -226,20 +236,18 @@ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type) { struct newreno *nreno; - uint32_t cwin, factor; + uint32_t beta, beta_ecn, cwin, factor; u_int mss; - factor = V_newreno_beta; - nreno = ccv->cc_data; - if (nreno != NULL) { - if (V_cc_do_abe) - factor = (type == CC_ECN ? nreno->beta_ecn: nreno->beta); - else - factor = nreno->beta; - } - cwin = CCV(ccv, snd_cwnd); mss = CCV(ccv, t_maxseg); + nreno = ccv->cc_data; + beta = (nreno == NULL) ? V_newreno_beta : nreno->beta; + beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn; + if (V_cc_do_abe && type == CC_ECN) + factor = beta_ecn; + else + factor = beta; /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, @@ -255,8 +263,8 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type) V_cc_do_abe && V_cc_abe_frlossreduce)) { CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_ssthresh) * - (uint64_t)nreno->beta) / - (100ULL * (uint64_t)nreno->beta_ecn); + (uint64_t)beta) / + (100ULL * (uint64_t)beta_ecn); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; @@ -280,7 +288,6 @@ static void newreno_post_recovery(struct cc_var *ccv) { int pipe; - pipe = 0; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* @@ -304,7 +311,7 @@ newreno_post_recovery(struct cc_var *ccv) } } -int +static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) { struct newreno *nreno; @@ -315,9 +322,15 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) nreno = ccv->cc_data; opt = buf; - + switch (sopt->sopt_dir) { case SOPT_SET: + /* We cannot set without cc_data memory. */ + if (nreno == NULL) { + nreno = newreno_malloc(ccv); + if (nreno == NULL) + return (ENOMEM); + } switch (opt->name) { case CC_NEWRENO_BETA: nreno->beta = opt->val; @@ -330,17 +343,21 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) default: return (ENOPROTOOPT); } + break; case SOPT_GET: switch (opt->name) { case CC_NEWRENO_BETA: - opt->val = nreno->beta; + opt->val = (nreno == NULL) ? + V_newreno_beta : nreno->beta; break; case CC_NEWRENO_BETA_ECN: - opt->val = nreno->beta_ecn; + opt->val = (nreno == NULL) ? + V_newreno_beta_ecn : nreno->beta_ecn; break; default: return (ENOPROTOOPT); } + break; default: return (EINVAL); } @@ -351,6 +368,7 @@ newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) static int newreno_beta_handler(SYSCTL_HANDLER_ARGS) { + if (req->newptr != NULL ) { if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe) return (EACCES); diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c index 699af2e4..0d608180 100644 --- a/freebsd/sys/netinet/if_ether.c +++ b/freebsd/sys/netinet/if_ether.c @@ -364,7 +364,7 @@ arprequest(struct ifnet *ifp, const struct in_addr *sip, struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -696,14 +696,6 @@ arpintr(struct mbuf *m) hlen = ETHER_ADDR_LEN; /* RFC 826 */ layer = "ethernet"; break; - case ARPHRD_IEEE802: - hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */ - layer = "fddi"; - break; - case ARPHRD_ARCNET: - hlen = 1; /* RFC 1201, ARC_ADDR_LEN */ - layer = "arcnet"; - break; case ARPHRD_INFINIBAND: hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ layer = "infiniband"; @@ -896,7 +888,7 @@ in_arpinput(struct mbuf *m) * as a dummy address for the rest of the function. */ IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { @@ -911,7 +903,7 @@ in_arpinput(struct mbuf *m) * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(&in_ifa_tracker); - if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { + if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } @@ -1455,7 +1447,7 @@ arp_handle_ifllchange(struct ifnet *ifp) { struct ifaddr *ifa; - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c index cf319470..a4b99f62 100644 --- a/freebsd/sys/netinet/igmp.c +++ b/freebsd/sys/netinet/igmp.c @@ -138,7 +138,7 @@ static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); -static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, +static void igmp_v3_process_group_timers(struct in_multi_head *, struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, @@ -164,12 +164,12 @@ static const struct netisr_handler igmp_nh = { * themselves are not virtualized. * * Locking: - * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. - * * IN_MULTI_LOCK covers in_multi. + * * IN_MULTI_LIST_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of @@ -443,7 +443,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) if (error) return (error); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { @@ -477,7 +477,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (error); } @@ -588,7 +588,6 @@ igi_alloc_locked(/*const*/ struct ifnet *ifp) igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; - SLIST_INIT(&igi->igi_relinmhead); mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); @@ -613,44 +612,37 @@ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifsoftc *igi; - struct ifmultiaddr *ifma; - struct in_multi *inm, *tinm; - + struct ifmultiaddr *ifma, *next; + struct in_multi *inm; + struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); + SLIST_INIT(&inm_free_tmp); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; -#if 0 - KASSERT(ifma->ifma_protospec != NULL, - ("%s: ifma_protospec is NULL", __func__)); -#endif inm = (struct in_multi *)ifma->ifma_protospec; - if (inm->inm_state == IGMP_LEAVING_MEMBER) { - SLIST_INSERT_HEAD(&igi->igi_relinmhead, - inm, inm_nrele); - } + if (inm->inm_state == IGMP_LEAVING_MEMBER) + inm_rele_locked(&inm_free_tmp, inm); inm_clear_recorded(inm); + if (__predict_false(ifma_restart)) { + ifma_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); - /* - * Free the in_multi reference(s) for this IGMP lifecycle. - */ - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, - tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); - inm_release_locked(inm); - } + IF_ADDR_WUNLOCK(ifp); + inm_release_list_deferred(&inm_free_tmp); } - IGMP_UNLOCK(); + } /* @@ -686,11 +678,6 @@ igi_delete_locked(const struct ifnet *ifp) mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); - - KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), - ("%s: there are dangling in_multi references", - __func__)); - free(igi, M_IGMP); return; } @@ -724,7 +711,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, } IGMPSTAT_INC(igps_rcv_gen_queries); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; @@ -749,7 +736,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, * except those which are already running. */ IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -780,7 +767,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -818,7 +805,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, IGMPSTAT_INC(igps_rcv_group_queries); } - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; @@ -850,7 +837,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -874,7 +861,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -901,7 +888,7 @@ igmp_v2_update_group(struct in_multi *inm, const int timer) CTR4(KTR_IGMPV3, "0x%08x: %s/%s timer=%d", __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: @@ -1013,7 +1000,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, IGMPSTAT_INC(igps_rcv_gsr_queries); } - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; @@ -1094,7 +1081,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -1111,7 +1098,7 @@ igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int retval; uint16_t nsrc; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; @@ -1233,11 +1220,11 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)", @@ -1248,7 +1235,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; @@ -1307,7 +1294,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, } out_locked: - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -1330,24 +1317,23 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, * leave requires knowing that we are the only member of a * group. */ + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } @@ -1363,8 +1349,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } - if (ia != NULL) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)", ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname); @@ -1375,7 +1360,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifsoftc *igi; @@ -1420,7 +1405,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, } out_locked: - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -1647,8 +1632,9 @@ igmp_fasttimo_vnet(void) struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifsoftc *igi; - struct ifmultiaddr *ifma; + struct ifmultiaddr *ifma, *next; struct in_multi *inm; + struct in_multi_head inm_free_tmp; int loop, uri_fasthz; loop = 0; @@ -1664,7 +1650,8 @@ igmp_fasttimo_vnet(void) !V_state_change_timers_running) return; - IN_MULTI_LOCK(); + SLIST_INIT(&inm_free_tmp); + IN_MULTI_LIST_LOCK(); IGMP_LOCK(); /* @@ -1709,8 +1696,9 @@ igmp_fasttimo_vnet(void) mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -1722,16 +1710,18 @@ igmp_fasttimo_vnet(void) igi->igi_version); break; case IGMP_VERSION_3: - igmp_v3_process_group_timers(igi, &qrq, + igmp_v3_process_group_timers(&inm_free_tmp, &qrq, &scq, inm, uri_fasthz); break; } + if (__predict_false(ifma_restart)) { + ifma_restart = false; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { - struct in_multi *tinm; - igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); @@ -1739,18 +1729,13 @@ igmp_fasttimo_vnet(void) * Free the in_multi reference(s) for this * IGMP lifecycle. */ - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, - inm_nrele, tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, - inm_nrele); - inm_release_locked(inm); - } + inm_release_list_deferred(&inm_free_tmp); } } out_locked: IGMP_UNLOCK(); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); } /* @@ -1762,7 +1747,7 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { @@ -1804,14 +1789,14 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) * Note: Unlocked read from igi. */ static void -igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, +igmp_v3_process_group_timers(struct in_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; @@ -1861,7 +1846,7 @@ igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, * immediate transmission. */ if (query_response_timer_expired) { - int retval; + int retval __unused; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); @@ -1909,8 +1894,7 @@ igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; - SLIST_INSERT_HEAD(&igi->igi_relinmhead, - inm, inm_nrele); + inm_rele_locked(inmh, inm); } } break; @@ -1931,7 +1915,7 @@ static void igmp_v3_suppress_group_record(struct in_multi *inm) { - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); @@ -2005,13 +1989,15 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; - struct in_multi *inm, *tinm; + struct in_multi *inm; + struct in_multi_head inm_free_tmp; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); + SLIST_INIT(&inm_free_tmp); /* * Stop the v3 General Query Response on this link stone dead. @@ -2026,7 +2012,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -2052,7 +2038,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ - SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + inm_rele_locked(&inm_free_tmp, inm); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: @@ -2071,10 +2057,8 @@ igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); - SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { - SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); - inm_release_locked(inm); - } + + inm_release_list_deferred(&inm_free_tmp); } /* @@ -2201,7 +2185,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type) struct ip *ip; struct mbuf *m; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; @@ -2278,10 +2262,8 @@ igmp_change_state(struct in_multi *inm) struct ifnet *ifp; int error; - IN_MULTI_LOCK_ASSERT(); - error = 0; - + IN_MULTI_LOCK_ASSERT(); /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. @@ -2381,9 +2363,10 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && - inm->inm_state == IGMP_LEAVING_MEMBER) - inm_release_locked(inm); - + inm->inm_state == IGMP_LEAVING_MEMBER) { + MPASS(inm->inm_refcount > 1); + inm_rele_locked(NULL, inm); + } inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { @@ -2475,7 +2458,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) ifp = inm->inm_ifp; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); @@ -2533,7 +2516,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname); - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { @@ -2579,7 +2562,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { - int retval; + int retval __unused; inm_acquire_locked(inm); @@ -2652,7 +2635,7 @@ igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; - int error, is_filter_list_change; + int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; @@ -2660,9 +2643,8 @@ igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, in_addr_t naddr; uint8_t mode; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); - error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; @@ -3020,7 +3002,7 @@ igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) uint8_t mode, now, then; rectype_t crt, drt, nrt; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) @@ -3223,7 +3205,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) domerge = 0; recslen = 0; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* @@ -3320,9 +3302,9 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; - int retval, loop; + int retval __unused, loop; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, @@ -3340,7 +3322,7 @@ igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -3544,11 +3526,11 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) ip->ip_src = ia->ia_addr.sin_addr; - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); @@ -3634,7 +3616,6 @@ DB_SHOW_COMMAND(igi_list, db_show_igi_list) db_printf(" qi %u\n", igi->igi_qi); db_printf(" qri %u\n", igi->igi_qri); db_printf(" uri %u\n", igi->igi_uri); - /* SLIST_HEAD(,in_multi) igi_relinmhead */ /* struct mbufq igi_gq; */ db_printf("\n"); } diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h index 4f9db06c..11f086f8 100644 --- a/freebsd/sys/netinet/igmp_var.h +++ b/freebsd/sys/netinet/igmp_var.h @@ -214,7 +214,6 @@ struct igmp_ifsoftc { uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ - SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ struct mbufq igi_gq; /* general query responses queue */ }; diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c index 28c257aa..7233f9a2 100644 --- a/freebsd/sys/netinet/in.c +++ b/freebsd/sys/netinet/in.c @@ -104,7 +104,7 @@ in_localaddr(struct in_addr in) struct in_ifaddr *ia; IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); @@ -145,7 +145,7 @@ in_ifhasaddr(struct ifnet *ifp, struct in_addr in) struct in_ifaddr *ia; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; @@ -282,7 +282,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; @@ -290,7 +290,7 @@ in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, break; } if (ifa == NULL) - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, @@ -381,7 +381,7 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) @@ -459,12 +459,12 @@ in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); - TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); + CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); @@ -537,12 +537,12 @@ fail1: (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); - TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); - TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); + CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ @@ -576,7 +576,7 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) @@ -601,12 +601,12 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) return (EADDRNOTAVAIL); } - TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); - TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); + CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); @@ -636,12 +636,10 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - IN_MULTI_LOCK(); if (ii->ii_allhosts) { - (void)in_leavegroup_locked(ii->ii_allhosts, NULL); + (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } - IN_MULTI_UNLOCK(); } IF_ADDR_WLOCK(ifp); @@ -682,7 +680,7 @@ in_addprefix(struct in_ifaddr *target, int flags) IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; @@ -842,7 +840,7 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) } IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; @@ -918,10 +916,10 @@ in_ifscrub_all(void) struct ifaliasreq ifr; IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_RLOCK(ifp); */ - TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { + CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; @@ -982,7 +980,7 @@ in_broadcast(struct in_addr in, struct ifnet *ifp) * with a broadcast address. */ IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { found = 1; @@ -998,11 +996,12 @@ in_broadcast(struct in_addr in, struct ifnet *ifp) void in_ifdetach(struct ifnet *ifp) { - + IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); + IN_MULTI_UNLOCK(); } /* @@ -1015,12 +1014,12 @@ in_ifdetach(struct ifnet *ifp) static void in_purgemaddrs(struct ifnet *ifp) { - LIST_HEAD(,in_multi) purgeinms; - struct in_multi *inm, *tinm; - struct ifmultiaddr *ifma; + struct in_multi_head purgeinms; + struct in_multi *inm; + struct ifmultiaddr *ifma, *next; - LIST_INIT(&purgeinms); - IN_MULTI_LOCK(); + SLIST_INIT(&purgeinms); + IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp @@ -1028,27 +1027,24 @@ in_purgemaddrs(struct ifnet *ifp) * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IF_ADDR_WLOCK(ifp); + restart: + CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; -#if 0 - KASSERT(ifma->ifma_protospec != NULL, - ("%s: ifma_protospec is NULL", __func__)); -#endif inm = (struct in_multi *)ifma->ifma_protospec; - LIST_INSERT_HEAD(&purgeinms, inm, inm_link); + inm_rele_locked(&purgeinms, inm); + if (__predict_false(ifma_restart)) { + ifma_restart = true; + goto restart; + } } - IF_ADDR_RUNLOCK(ifp); + IF_ADDR_WUNLOCK(ifp); - LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { - LIST_REMOVE(inm, inm_link); - inm_release_locked(inm); - } + inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); - - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); } struct in_llentry { @@ -1063,9 +1059,11 @@ struct in_llentry { * Do actual deallocation of @lle. */ static void -in_lltable_destroy_lle_unlocked(struct llentry *lle) +in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { + struct llentry *lle; + lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); @@ -1093,7 +1091,7 @@ in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); - in_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); } static struct llentry * @@ -1160,7 +1158,6 @@ in_lltable_match_prefix(const struct sockaddr *saddr, static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { - struct ifnet *ifp; size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); @@ -1168,8 +1165,7 @@ in_lltable_free_entry(struct lltable *llt, struct llentry *lle) /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { - ifp = llt->llt_ifp; - IF_AFDATA_WLOCK_ASSERT(ifp); + IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } @@ -1304,7 +1300,7 @@ in_lltable_find_dst(struct lltable *llt, struct in_addr dst) hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; - LIST_FOREACH(lle, lleh, lle_next) { + CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) @@ -1360,7 +1356,7 @@ in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { - in_lltable_destroy_lle_unlocked(lle); + epoch_call(net_epoch_preempt, &lle->lle_epoch_ctx, in_lltable_destroy_lle_unlocked); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, @@ -1420,52 +1416,51 @@ in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, int error; bzero(&arpc, sizeof(arpc)); - /* skip deleted entries */ - if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) - return (0); - /* Skip if jailed and not a valid IP of the prison. */ - lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); - if (prison_if(wr->td->td_ucred, - (struct sockaddr *)&arpc.sin) != 0) - return (0); - /* - * produce a msg made of: - * struct rt_msghdr; - * struct sockaddr_in; (IPv4) - * struct sockaddr_dl; - */ - arpc.rtm.rtm_msglen = sizeof(arpc); - arpc.rtm.rtm_version = RTM_VERSION; - arpc.rtm.rtm_type = RTM_GET; - arpc.rtm.rtm_flags = RTF_UP; - arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; - - /* publish */ - if (lle->la_flags & LLE_PUB) - arpc.rtm.rtm_flags |= RTF_ANNOUNCE; - - sdl = &arpc.sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_len = sizeof(*sdl); - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = ifp->if_type; - if ((lle->la_flags & LLE_VALID) == LLE_VALID) { - sdl->sdl_alen = ifp->if_addrlen; - bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); - } else { - sdl->sdl_alen = 0; - bzero(LLADDR(sdl), ifp->if_addrlen); - } + /* skip deleted entries */ + if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) + return (0); + /* Skip if jailed and not a valid IP of the prison. */ + lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); + if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) + return (0); + /* + * produce a msg made of: + * struct rt_msghdr; + * struct sockaddr_in; (IPv4) + * struct sockaddr_dl; + */ + arpc.rtm.rtm_msglen = sizeof(arpc); + arpc.rtm.rtm_version = RTM_VERSION; + arpc.rtm.rtm_type = RTM_GET; + arpc.rtm.rtm_flags = RTF_UP; + arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; + + /* publish */ + if (lle->la_flags & LLE_PUB) + arpc.rtm.rtm_flags |= RTF_ANNOUNCE; + + sdl = &arpc.sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(*sdl); + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; + if ((lle->la_flags & LLE_VALID) == LLE_VALID) { + sdl->sdl_alen = ifp->if_addrlen; + bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); + } else { + sdl->sdl_alen = 0; + bzero(LLADDR(sdl), ifp->if_addrlen); + } - arpc.rtm.rtm_rmx.rmx_expire = - lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; - arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); - if (lle->la_flags & LLE_STATIC) - arpc.rtm.rtm_flags |= RTF_STATIC; - if (lle->la_flags & LLE_IFADDR) - arpc.rtm.rtm_flags |= RTF_PINNED; - arpc.rtm.rtm_index = ifp->if_index; - error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); + arpc.rtm.rtm_rmx.rmx_expire = + lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; + arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); + if (lle->la_flags & LLE_STATIC) + arpc.rtm.rtm_flags |= RTF_STATIC; + if (lle->la_flags & LLE_IFADDR) + arpc.rtm.rtm_flags |= RTF_PINNED; + arpc.rtm.rtm_index = ifp->if_index; + error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index 41beed9b..ea4779fc 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/ktr.h> #include <sys/taskqueue.h> +#include <sys/gtaskqueue.h> #include <sys/tree.h> #include <net/if.h> @@ -61,6 +62,8 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/vnet.h> +#include <net/ethernet.h> + #include <netinet/in.h> #include <netinet/in_systm.h> #include <netinet/in_fib.h> @@ -93,17 +96,25 @@ static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", /* * Locking: - * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however * it can be taken by code in net/if.c also. * - ip_moptions and in_mfilter are covered by the INP_WLOCK. * - * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly + * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly * any need for in_multi itself to be virtualized -- it is bound to an ifp * anyway no matter what happens. */ -struct mtx in_multi_mtx; -MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); +struct mtx in_multi_list_mtx; +MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF); + +struct mtx in_multi_free_mtx; +MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF); + +struct sx in_multi_sx; +SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx"); + +int ifma_restart; /* * Functions with non-static linkage defined in this file should be @@ -153,10 +164,9 @@ static int inm_is_ifp_detached(const struct in_multi *); static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); +static void inm_release(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); -static void inp_freemoptions_internal(struct ip_moptions *); -static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); @@ -189,10 +199,6 @@ static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); -static STAILQ_HEAD(, ip_moptions) imo_gc_list = - STAILQ_HEAD_INITIALIZER(imo_gc_list); -static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); - #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. @@ -218,6 +224,93 @@ inm_is_ifp_detached(const struct in_multi *inm) } #endif +static struct grouptask free_gtask; +static struct in_multi_head inm_free_list; +static void inm_release_task(void *arg __unused); +static void inm_init(void) +{ + SLIST_INIT(&inm_free_list); + taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); +} + +SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, + inm_init, NULL); + + +void +inm_release_list_deferred(struct in_multi_head *inmh) +{ + + if (SLIST_EMPTY(inmh)) + return; + mtx_lock(&in_multi_free_mtx); + SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele); + mtx_unlock(&in_multi_free_mtx); + GROUPTASK_ENQUEUE(&free_gtask); +} + +void +inm_disconnect(struct in_multi *inm) +{ + struct ifnet *ifp; + struct ifmultiaddr *ifma, *ll_ifma; + + ifp = inm->inm_ifp; + IF_ADDR_WLOCK_ASSERT(ifp); + ifma = inm->inm_ifma; + + if_ref(ifp); + CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); + MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); + if ((ll_ifma = ifma->ifma_llifma) != NULL) { + MPASS(ifma != ll_ifma); + ifma->ifma_llifma = NULL; + MPASS(ll_ifma->ifma_llifma == NULL); + MPASS(ll_ifma->ifma_ifp == ifp); + if (--ll_ifma->ifma_refcount == 0) { + CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); + MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); + if_freemulti(ll_ifma); + ifma_restart = true; + } + } +} + +void +inm_release_deferred(struct in_multi *inm) +{ + struct in_multi_head tmp; + + IN_MULTI_LIST_LOCK_ASSERT(); + MPASS(inm->inm_refcount > 0); + if (--inm->inm_refcount == 0) { + SLIST_INIT(&tmp); + inm_disconnect(inm); + inm->inm_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(&tmp, inm, inm_nrele); + inm_release_list_deferred(&tmp); + } +} + +static void +inm_release_task(void *arg __unused) +{ + struct in_multi_head inm_free_tmp; + struct in_multi *inm, *tinm; + + SLIST_INIT(&inm_free_tmp); + mtx_lock(&in_multi_free_mtx); + SLIST_CONCAT(&inm_free_tmp, &inm_free_list, in_multi, inm_nrele); + mtx_unlock(&in_multi_free_mtx); + IN_MULTI_LOCK(); + SLIST_FOREACH_SAFE(inm, &inm_free_tmp, inm_nrele, tinm) { + SLIST_REMOVE_HEAD(&inm_free_tmp, inm_nrele); + MPASS(inm); + inm_release(inm); + } + IN_MULTI_UNLOCK(); +} + /* * Initialize an in_mfilter structure to a known state at t0, t1 * with an empty source filter list. @@ -234,7 +327,7 @@ imf_init(struct in_mfilter *imf, const int st0, const int st1) /* * Function for looking up an in_multi record for an IPv4 multicast address * on a given interface. ifp must be valid. If no record found, return NULL. - * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. + * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held. */ struct in_multi * inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) @@ -242,17 +335,18 @@ inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) struct ifmultiaddr *ifma; struct in_multi *inm; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_LOCK_ASSERT(ifp); inm = NULL; - TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { - if (ifma->ifma_addr->sa_family == AF_INET) { - inm = (struct in_multi *)ifma->ifma_protospec; - if (inm->inm_addr.s_addr == ina.s_addr) - break; - inm = NULL; - } + CK_STAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_addr.s_addr == ina.s_addr) + break; + inm = NULL; } return (inm); } @@ -266,7 +360,7 @@ inm_lookup(struct ifnet *ifp, const struct in_addr ina) { struct in_multi *inm; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); IF_ADDR_RLOCK(ifp); inm = inm_lookup_locked(ifp, ina); IF_ADDR_RUNLOCK(ifp); @@ -453,7 +547,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, IN_MULTI_LOCK_ASSERT(); ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; - + IN_MULTI_LIST_LOCK(); inm = inm_lookup(ifp, *group); if (inm != NULL) { /* @@ -462,11 +556,13 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, */ KASSERT(inm->inm_refcount >= 1, ("%s: bad refcount %d", __func__, inm->inm_refcount)); - ++inm->inm_refcount; + inm_acquire_locked(inm); *pinm = inm; - return (0); } - + IN_MULTI_LIST_UNLOCK(); + if (inm != NULL) + return (0); + memset(&gsin, 0, sizeof(gsin)); gsin.sin_family = AF_INET; gsin.sin_len = sizeof(struct sockaddr_in); @@ -481,6 +577,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, return (error); /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ + IN_MULTI_LIST_LOCK(); IF_ADDR_WLOCK(ifp); /* @@ -506,10 +603,9 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, __func__, ifma, inm, inet_ntoa_r(*group, addrbuf)); } #endif - ++inm->inm_refcount; + inm_acquire_locked(inm); *pinm = inm; - IF_ADDR_WUNLOCK(ifp); - return (0); + goto out_locked; } IF_ADDR_WLOCK_ASSERT(ifp); @@ -524,6 +620,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { IF_ADDR_WUNLOCK(ifp); + IN_MULTI_LIST_UNLOCK(); if_delmulti_ifma(ifma); return (ENOMEM); } @@ -541,8 +638,9 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, ifma->ifma_protospec = inm; *pinm = inm; - + out_locked: IF_ADDR_WUNLOCK(ifp); + IN_MULTI_LIST_UNLOCK(); return (0); } @@ -552,36 +650,33 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, * If the refcount drops to 0, free the in_multi record and * delete the underlying link-layer membership. */ -void -inm_release_locked(struct in_multi *inm) +static void +inm_release(struct in_multi *inm) { struct ifmultiaddr *ifma; - - IN_MULTI_LOCK_ASSERT(); + struct ifnet *ifp; CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); - - if (--inm->inm_refcount > 0) { - CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, - inm->inm_refcount); - return; - } - + MPASS(inm->inm_refcount == 0); CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); ifma = inm->inm_ifma; + ifp = inm->inm_ifp; /* XXX this access is not covered by IF_ADDR_LOCK */ CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); - KASSERT(ifma->ifma_protospec == inm, - ("%s: ifma_protospec != inm", __func__)); - ifma->ifma_protospec = NULL; - - inm_purge(inm); - - free(inm, M_IPMADDR); - - if_delmulti_ifma(ifma); + if (ifp != NULL) { + CURVNET_SET(ifp->if_vnet); + inm_purge(inm); + free(inm, M_IPMADDR); + if_delmulti_ifma_flags(ifma, 1); + CURVNET_RESTORE(); + if_rele(ifp); + } else { + inm_purge(inm); + free(inm, M_IPMADDR); + if_delmulti_ifma_flags(ifma, 1); + } } /* @@ -594,7 +689,7 @@ inm_clear_recorded(struct in_multi *inm) { struct ip_msource *ims; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { if (ims->ims_stp) { @@ -634,7 +729,7 @@ inm_record_source(struct in_multi *inm, const in_addr_t naddr) struct ip_msource find; struct ip_msource *ims, *nims; - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); find.ims_haddr = ntohl(naddr); ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); @@ -961,6 +1056,7 @@ inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) schanged = 0; error = 0; nsrc1 = nsrc0 = 0; + IN_MULTI_LIST_LOCK_ASSERT(); /* * Update the source filters first, as this may fail. @@ -1167,6 +1263,7 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, int error; IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_UNLOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__, ntohl(gina->s_addr), ifp, ifp->if_xname); @@ -1188,7 +1285,7 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); return (error); } - + IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { @@ -1203,13 +1300,15 @@ in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, goto out_inm_release; } -out_inm_release: + out_inm_release: if (error) { + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); - inm_release_locked(inm); + inm_release_deferred(inm); } else { *pinm = inm; } + IN_MULTI_LIST_UNLOCK(); return (error); } @@ -1251,6 +1350,7 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) error = 0; IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_UNLOCK_ASSERT(); CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__, inm, ntohl(inm->inm_addr.s_addr), @@ -1274,18 +1374,22 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) * the transaction, it MUST NOT fail. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); + IF_ADDR_WLOCK(inm->inm_ifp); + inm_release_deferred(inm); + IF_ADDR_WUNLOCK(inm->inm_ifp); + IN_MULTI_LIST_UNLOCK(); CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); - inm_release_locked(inm); return (error); } @@ -1317,18 +1421,6 @@ in_addmulti(struct in_addr *ap, struct ifnet *ifp) } /* - * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. - * This KPI is for legacy kernel consumers only. - */ -void -in_delmulti(struct in_multi *inm) -{ - - (void)in_leavegroup(inm, NULL); -} -/*#endif*/ - -/* * Block or unblock an ASM multicast source on an inpcb. * This implements the delta-based API described in RFC 3678. * @@ -1489,7 +1581,7 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); - + IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); error = inm_merge(inm, imf); if (error) { @@ -1505,7 +1597,7 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) out_in_multi_locked: IN_MULTI_UNLOCK(); - + IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); @@ -1571,37 +1663,31 @@ inp_findmoptions(struct inpcb *inp) return (imo); } -/* - * Discard the IP multicast options (and source filters). To minimize - * the amount of work done while holding locks such as the INP's - * pcbinfo lock (which is used in the receive path), the free - * operation is performed asynchronously in a separate task. - * - * SMPng: NOTE: assumes INP write lock is held. - */ -void -inp_freemoptions(struct ip_moptions *imo) -{ - - KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); - IN_MULTI_LOCK(); - STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); - IN_MULTI_UNLOCK(); - taskqueue_enqueue(taskqueue_thread, &imo_gc_task); -} - static void -inp_freemoptions_internal(struct ip_moptions *imo) +inp_gcmoptions(epoch_context_t ctx) { + struct ip_moptions *imo; struct in_mfilter *imf; + struct in_multi *inm; + struct ifnet *ifp; size_t idx, nmships; + imo = __containerof(ctx, struct ip_moptions, imo_epoch_ctx); + nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; if (imf) imf_leave(imf); - (void)in_leavegroup(imo->imo_membership[idx], imf); + inm = imo->imo_membership[idx]; + ifp = inm->inm_ifp; + if (ifp != NULL) { + CURVNET_SET(ifp->if_vnet); + (void)in_leavegroup(inm, imf); + CURVNET_RESTORE(); + } else { + (void)in_leavegroup(inm, imf); + } if (imf) imf_purge(imf); } @@ -1612,20 +1698,18 @@ inp_freemoptions_internal(struct ip_moptions *imo) free(imo, M_IPMOPTS); } -static void -inp_gcmoptions(void *context, int pending) +/* + * Discard the IP multicast options (and source filters). To minimize + * the amount of work done while holding locks such as the INP's + * pcbinfo lock (which is used in the receive path), the free + * operation is deferred to the epoch callback task. + */ +void +inp_freemoptions(struct ip_moptions *imo) { - struct ip_moptions *imo; - - IN_MULTI_LOCK(); - while (!STAILQ_EMPTY(&imo_gc_list)) { - imo = STAILQ_FIRST(&imo_gc_list); - STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); - IN_MULTI_UNLOCK(); - inp_freemoptions_internal(imo); - IN_MULTI_LOCK(); - } - IN_MULTI_UNLOCK(); + if (imo == NULL) + return; + epoch_call(net_epoch_preempt, &imo->imo_epoch_ctx, inp_gcmoptions); } /* @@ -1794,12 +1878,12 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; + NET_EPOCH_ENTER(); IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia != NULL) { + if (ia != NULL) mreqn.imr_address = IA_SIN(ia)->sin_addr; - ifa_free(&ia->ia_ifa); - } + NET_EPOCH_EXIT(); } } INP_WUNLOCK(inp); @@ -1907,7 +1991,7 @@ inp_lookup_mcast_ifp(const struct inpcb *inp, mifp = NULL; IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && (mifp->if_flags & IFF_MULTICAST)) { @@ -2165,6 +2249,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at IGMP layer. */ + in_pcbref(inp); + INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_new) { @@ -2173,20 +2259,23 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) if (error) { CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed", __func__); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); goto out_imo_free; } imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", - __func__); + __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); @@ -2197,8 +2286,9 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) out_in_multi_locked: IN_MULTI_UNLOCK(); - - INP_WLOCK_ASSERT(inp); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (ENXIO); if (error) { imf_rollback(imf); if (is_new) @@ -2387,6 +2477,8 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) /* * Begin state merge transaction at IGMP layer. */ + in_pcbref(inp); + INP_WUNLOCK(inp); IN_MULTI_LOCK(); if (is_final) { @@ -2397,6 +2489,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) (void)in_leavegroup_locked(inm, imf); } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", @@ -2406,6 +2499,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) { CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); @@ -2415,6 +2509,9 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) out_in_multi_locked: IN_MULTI_UNLOCK(); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) + return (ENXIO); if (error) imf_rollback(imf); @@ -2641,6 +2738,7 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); /* * Begin state merge transaction at IGMP layer. @@ -2649,11 +2747,13 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); @@ -2885,10 +2985,10 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) if (retval) return (retval); - IN_MULTI_LOCK(); + IN_MULTI_LIST_LOCK(); IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; @@ -2918,7 +3018,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) } IF_ADDR_RUNLOCK(ifp); - IN_MULTI_UNLOCK(); + IN_MULTI_LIST_UNLOCK(); return (retval); } diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index 0d388132..f89487b6 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -64,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include <sys/domain.h> #include <sys/protosw.h> #include <sys/rmlock.h> +#include <sys/smp.h> #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/sockio.h> @@ -93,6 +94,9 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_pcb.h> #include <netinet/ip_var.h> #include <netinet/tcp_var.h> +#ifdef TCPHPTS +#include <netinet/tcp_hpts.h> +#endif #include <netinet/udp.h> #include <netinet/udp_var.h> #endif @@ -590,7 +594,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); - if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ + if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) @@ -802,7 +806,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); - /* * Bypass source address selection and use the primary jail IP * if requested. @@ -835,15 +838,18 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, * network and try to find a corresponding interface to take * the source address from. */ + NET_EPOCH_ENTER(); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); - if (ia == NULL) + if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); + + } if (ia == NULL) { error = ENETUNREACH; goto done; @@ -851,15 +857,13 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) @@ -918,7 +922,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; @@ -972,7 +976,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - ifa_free(&ia->ia_ifa); goto done; } @@ -981,10 +984,9 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ifnet *ifp; ifp = ia->ia_ifp; - ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) @@ -1010,6 +1012,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } done: + NET_EPOCH_EXIT(); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); @@ -1063,7 +1066,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, faddr = sin->sin_addr; fport = sin->sin_port; - if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { + if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. @@ -1074,16 +1077,16 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = - IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; + IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); - if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & + if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) - faddr = satosin(&TAILQ_FIRST( + faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } @@ -1104,7 +1107,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, @@ -1236,9 +1239,28 @@ in_pcbrele_rlocked(struct inpcb *inp) } return (0); } - + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - +#ifdef TCPHPTS + if (inp->inp_in_hpts || inp->inp_in_input) { + struct tcp_hpts_entry *hpts; + /* + * We should not be on the hpts at + * this point in any form. we must + * get the lock to be sure. + */ + hpts = tcp_hpts_lock(inp); + if (inp->inp_in_hpts) + panic("Hpts:%p inp:%p at free still on hpts", + hpts, inp); + mtx_unlock(&hpts->p_mtx); + hpts = tcp_input_lock(inp); + if (inp->inp_in_input) + panic("Hpts:%p inp:%p at free still on input hpts", + hpts, inp); + mtx_unlock(&hpts->p_mtx); + } +#endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); @@ -1267,7 +1289,26 @@ in_pcbrele_wlocked(struct inpcb *inp) } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - +#ifdef TCPHPTS + if (inp->inp_in_hpts || inp->inp_in_input) { + struct tcp_hpts_entry *hpts; + /* + * We should not be on the hpts at + * this point in any form. we must + * get the lock to be sure. + */ + hpts = tcp_hpts_lock(inp); + if (inp->inp_in_hpts) + panic("Hpts:%p inp:%p at free still on hpts", + hpts, inp); + mtx_unlock(&hpts->p_mtx); + hpts = tcp_input_lock(inp); + if (inp->inp_in_input) + panic("Hpts:%p inp:%p at free still on input hpts", + hpts, inp); + mtx_unlock(&hpts->p_mtx); + } +#endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); @@ -1284,6 +1325,28 @@ in_pcbrele(struct inpcb *inp) return (in_pcbrele_wlocked(inp)); } +void +in_pcblist_rele_rlocked(epoch_context_t ctx) +{ + struct in_pcblist *il; + struct inpcb *inp; + struct inpcbinfo *pcbinfo; + int i, n; + + il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); + pcbinfo = il->il_pcbinfo; + n = il->il_count; + INP_INFO_WLOCK(pcbinfo); + for (i = 0; i < n; i++) { + inp = il->il_inp_list[i]; + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); + } + INP_INFO_WUNLOCK(pcbinfo); + free(il, M_TEMP); +} + /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached @@ -1298,8 +1361,21 @@ in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; +#ifdef INET6 + struct ip6_moptions *im6o = NULL; +#endif +#ifdef INET + struct ip_moptions *imo = NULL; +#endif KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + KASSERT((inp->inp_flags2 & INP_FREED) == 0, + ("%s: called twice for pcb %p", __func__, inp)); + if (inp->inp_flags2 & INP_FREED) { + INP_WUNLOCK(inp); + return; + } + #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); @@ -1309,6 +1385,10 @@ in_pcbfree(struct inpcb *inp) #endif INP_WLOCK_ASSERT(inp); +#ifdef INET + imo = inp->inp_moptions; + inp->inp_moptions = NULL; +#endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) @@ -1321,16 +1401,12 @@ in_pcbfree(struct inpcb *inp) #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); - if (inp->in6p_moptions != NULL) - ip6_freemoptions(inp->in6p_moptions); + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); -#ifdef INET - if (inp->inp_moptions != NULL) - inp_freemoptions(inp->inp_moptions); -#endif RO_INVALIDATE_CACHE(&inp->inp_route); inp->inp_vflag = 0; @@ -1339,6 +1415,12 @@ in_pcbfree(struct inpcb *inp) #ifdef MAC mac_inpcb_destroy(inp); #endif +#ifdef INET6 + ip6_freemoptions(im6o); +#endif +#ifdef INET + inp_freemoptions(imo); +#endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } @@ -1492,11 +1574,14 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) /* * Drop multicast group membership if we joined * through the interface being detached. + * + * XXX This can all be deferred to an epoch_call */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { - in_delmulti(imo->imo_membership[i]); + IN_MULTI_LOCK_ASSERT(); + in_leavegroup_locked(imo->imo_membership[i], NULL); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h index 574ab407..d00dd456 100644 --- a/freebsd/sys/netinet/in_pcb.h +++ b/freebsd/sys/netinet/in_pcb.h @@ -41,6 +41,7 @@ #define _NETINET_IN_PCB_H_ #include <sys/queue.h> +#include <sys/epoch.h> #include <sys/_lock.h> #include <sys/_mutex.h> #include <sys/_rwlock.h> @@ -156,6 +157,7 @@ struct in_conninfo { * from the global list. * * Key: + * (b) - Protected by the hpts lock. * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock @@ -164,6 +166,51 @@ struct in_conninfo { * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking + * + * Notes on the tcp_hpts: + * + * First Hpts lock order is + * 1) INP_WLOCK() + * 2) HPTS_LOCK() i.e. hpts->pmtx + * + * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). + * You may check the inp->inp_in_hpts flag without the hpts lock. + * The hpts is the only one that will clear this flag holding + * only the hpts lock. This means that in your tcp_output() + * routine when you test for the inp_in_hpts flag to be 1 + * it may be transitioning to 0 (by the hpts). + * That's ok since that will just mean an extra call to tcp_output + * that most likely will find the call you executed + * (when the mis-match occured) will have put the TCB back + * on the hpts and it will return. If your + * call did not add the inp back to the hpts then you will either + * over-send or the cwnd will block you from sending more. + * + * Note you should also be holding the INP_WLOCK() when you + * call the remove from the hpts as well. Though usually + * you are either doing this from a timer, where you need and have + * the INP_WLOCK() or from destroying your TCB where again + * you should already have the INP_WLOCK(). + * + * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and + * inp_input_cpu_set fields are controlled completely by + * the hpts. Do not ever set these. The inp_hpts_cpu_set + * and inp_input_cpu_set fields indicate if the hpts has + * setup the respective cpu field. It is advised if this + * field is 0, to enqueue the packet with the appropriate + * hpts_immediate() call. If the _set field is 1, then + * you may compare the inp_*_cpu field to the curcpu and + * may want to again insert onto the hpts if these fields + * are not equal (i.e. you are not on the expected CPU). + * + * A note on inp_hpts_calls and inp_input_calls, these + * flags are set when the hpts calls either the output + * or do_segment routines respectively. If the routine + * being called wants to use this, then it needs to + * clear the flag before returning. The hpts will not + * clear the flag. The flags can be used to tell if + * the hpts is the function calling the respective + * routine. * * A few other notes: * @@ -190,14 +237,45 @@ struct inpcb { LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ -#define inp_start_zero inp_refcount +#define inp_start_zero inp_hpts #define inp_zero_size (sizeof(struct inpcb) - \ offsetof(struct inpcb, inp_start_zero)) + TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */ + + uint32_t inp_hpts_request; /* Current hpts request, zero if + * fits in the pacing window (i&b). */ + /* + * Note the next fields are protected by a + * different lock (hpts-lock). This means that + * they must correspond in size to the smallest + * protectable bit field (uint8_t on x86, and + * other platfomrs potentially uint32_t?). Also + * since CPU switches can occur at different times the two + * fields can *not* be collapsed into a signal bit field. + */ +#if defined(__amd64__) || defined(__i386__) + volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */ + volatile uint8_t inp_in_input; /* on input hpts (lock b) */ +#else + volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */ + volatile uint32_t inp_in_input; /* on input hpts (lock b) */ +#endif + volatile uint16_t inp_hpts_cpu; /* Lock (i) */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ + volatile uint16_t inp_input_cpu; /* Lock (i) */ + volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */ + inp_input_cpu_set : 1, /* on input hpts (i) */ + inp_hpts_calls :1, /* (i) from output hpts */ + inp_input_calls :1, /* (i) from input hpts */ + inp_spare_bits2 : 4; + uint8_t inp_spare_byte; /* Compiler hole */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ + uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ + uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ + TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ @@ -330,6 +408,13 @@ struct inpcbport { u_short phd_port; }; +struct in_pcblist { + int il_count; + struct epoch_context il_epoch_ctx; + struct inpcbinfo *il_pcbinfo; + struct inpcb *il_inp_list[0]; +}; + /*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. @@ -638,6 +723,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ +#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ /* * Flags passed to in_pcblookup*() functions. @@ -751,6 +837,7 @@ void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); +void in_pcblist_rele_rlocked(epoch_context_t ctx); void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c index f1dec6c5..a563c950 100644 --- a/freebsd/sys/netinet/in_proto.c +++ b/freebsd/sys/netinet/in_proto.c @@ -229,7 +229,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, - .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { @@ -239,7 +238,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, - .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { @@ -249,7 +247,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, - .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, { @@ -259,7 +256,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, - .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, # ifdef INET6 @@ -270,7 +266,6 @@ struct protosw inetsw[] = { .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap4_input, .pr_ctloutput = rip_ctloutput, - .pr_init = encap_init, .pr_usrreqs = &rip_usrreqs }, #endif diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h index ff722fc9..5b7a464b 100644 --- a/freebsd/sys/netinet/in_var.h +++ b/freebsd/sys/netinet/in_var.h @@ -55,6 +55,7 @@ struct in_aliasreq { struct igmp_ifsoftc; struct in_multi; struct lltable; +SLIST_HEAD(in_multi_head, in_multi); /* * IPv4 per-interface state. @@ -79,7 +80,7 @@ struct in_ifaddr { u_long ia_subnet; /* subnet address */ u_long ia_subnetmask; /* mask of subnet */ LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ - TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ + CK_STAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ struct sockaddr_in ia_addr; /* reserve space for interface name */ struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr @@ -106,7 +107,7 @@ extern u_char inetctlerrmap[]; /* * Hash table for IP addresses. */ -TAILQ_HEAD(in_ifaddrhead, in_ifaddr); +CK_STAILQ_HEAD(in_ifaddrhead, in_ifaddr); LIST_HEAD(in_ifaddrhashhead, in_ifaddr); VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); @@ -171,12 +172,10 @@ do { \ /* struct rm_priotracker *t; */ \ do { \ IN_IFADDR_RLOCK((t)); \ - for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ + for ((ia) = CK_STAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ - (ia) = TAILQ_NEXT((ia), ia_link)) \ + (ia) = CK_STAILQ_NEXT((ia), ia_link)) \ continue; \ - if ((ia) != NULL) \ - ifa_ref(&(ia)->ia_ifa); \ IN_IFADDR_RUNLOCK((t)); \ } while (0) @@ -329,21 +328,53 @@ SYSCTL_DECL(_net_inet_raw); * consumers of IN_*_MULTI() macros should acquire the locks before * calling them; users of the in_{add,del}multi() functions should not. */ -extern struct mtx in_multi_mtx; -#define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) -#define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) -#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) -#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) +extern struct mtx in_multi_list_mtx; +extern struct sx in_multi_sx; + +#define IN_MULTI_LIST_LOCK() mtx_lock(&in_multi_list_mtx) +#define IN_MULTI_LIST_UNLOCK() mtx_unlock(&in_multi_list_mtx) +#define IN_MULTI_LIST_LOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_OWNED) +#define IN_MULTI_LIST_UNLOCK_ASSERT() mtx_assert(&in_multi_list_mtx, MA_NOTOWNED) + +#define IN_MULTI_LOCK() sx_xlock(&in_multi_sx) +#define IN_MULTI_UNLOCK() sx_xunlock(&in_multi_sx) +#define IN_MULTI_LOCK_ASSERT() sx_assert(&in_multi_sx, SA_XLOCKED) +#define IN_MULTI_UNLOCK_ASSERT() sx_assert(&in_multi_sx, SA_XUNLOCKED) + +void inm_disconnect(struct in_multi *inm); +extern int ifma_restart; /* Acquire an in_multi record. */ static __inline void inm_acquire_locked(struct in_multi *inm) { - IN_MULTI_LOCK_ASSERT(); + IN_MULTI_LIST_LOCK_ASSERT(); ++inm->inm_refcount; } +static __inline void +inm_acquire(struct in_multi *inm) +{ + IN_MULTI_LIST_LOCK(); + inm_acquire_locked(inm); + IN_MULTI_LIST_UNLOCK(); +} + +static __inline void +inm_rele_locked(struct in_multi_head *inmh, struct in_multi *inm) +{ + MPASS(inm->inm_refcount > 0); + IN_MULTI_LIST_LOCK_ASSERT(); + + if (--inm->inm_refcount == 0) { + MPASS(inmh != NULL); + inm_disconnect(inm); + inm->inm_ifma->ifma_protospec = NULL; + SLIST_INSERT_HEAD(inmh, inm, inm_nrele); + } +} + /* * Return values for imo_multi_filter(). */ @@ -364,11 +395,10 @@ void inm_commit(struct in_multi *); void inm_clear_recorded(struct in_multi *); void inm_print(const struct in_multi *); int inm_record_source(struct in_multi *inm, const in_addr_t); -void inm_release(struct in_multi *); -void inm_release_locked(struct in_multi *); +void inm_release_deferred(struct in_multi *); +void inm_release_list_deferred(struct in_multi_head *); struct in_multi * - in_addmulti(struct in_addr *, struct ifnet *); -void in_delmulti(struct in_multi *); +in_addmulti(struct in_addr *, struct ifnet *); int in_joingroup(struct ifnet *, const struct in_addr *, /*const*/ struct in_mfilter *, struct in_multi **); int in_joingroup_locked(struct ifnet *, const struct in_addr *, diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index e2bd0a0a..6f5160e0 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -57,7 +57,6 @@ __FBSDID("$FreeBSD$"); #include <sys/counter.h> #include <net/ethernet.h> -#include <net/fddi.h> #include <net/if.h> #include <net/if_var.h> #include <net/if_dl.h> @@ -213,11 +212,13 @@ static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW; static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) +static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); -SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets"); +SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I", + "Accept incoming CARP packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, @@ -277,8 +278,7 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ - IF_ADDR_LOCK_ASSERT(ifp); \ - TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ + CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ @@ -879,7 +879,7 @@ carp_best_ifa(int af, struct ifnet *ifp) return (NULL); best = NULL; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; @@ -1161,7 +1161,7 @@ carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) ifa = NULL; IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) @@ -1294,7 +1294,8 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || - (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) + (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || + !V_carp_allow) return; switch (sc->sc_state) { @@ -1408,7 +1409,7 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) break; } in6m = NULL; - if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { free(im6o->im6o_membership, M_CARP); break; } @@ -1423,13 +1424,13 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { - in6_mc_leave(im6o->im6o_membership[0], NULL); + in6_leavegroup(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } in6m = NULL; - if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { - in6_mc_leave(im6o->im6o_membership[0], NULL); + if ((error = in6_joingroup(ifp, &in6, NULL, &in6m, 0)) != 0) { + in6_leavegroup(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } @@ -1472,8 +1473,8 @@ carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) if (cif->cif_naddrs6 == 0) { struct ip6_moptions *im6o = &cif->cif_im6o; - in6_mc_leave(im6o->im6o_membership[0], NULL); - in6_mc_leave(im6o->im6o_membership[1], NULL); + in6_leavegroup(im6o->im6o_membership[0], NULL); + in6_leavegroup(im6o->im6o_membership[1], NULL); KASSERT(im6o->im6o_mfilters == NULL, ("%s: im6o_mfilters != NULL", __func__)); free(im6o->im6o_membership, M_CARP); @@ -1528,18 +1529,6 @@ carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) eh->ether_shost[5] = sc->sc_vhid; } break; - case IFT_FDDI: { - struct fddi_header *fh; - - fh = mtod(m, struct fddi_header *); - fh->fddi_shost[0] = 0; - fh->fddi_shost[1] = 0; - fh->fddi_shost[2] = 0x5e; - fh->fddi_shost[3] = 0; - fh->fddi_shost[4] = 1; - fh->fddi_shost[5] = sc->sc_vhid; - } - break; default: printf("%s: carp is not supported for the %d interface type\n", ifp->if_xname, ifp->if_type); @@ -1721,7 +1710,6 @@ carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: - case IFT_FDDI: break; default: error = EOPNOTSUPP; @@ -2057,7 +2045,8 @@ carp_sc_state(struct carp_softc *sc) CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || - !(sc->sc_carpdev->if_flags & IFF_UP)) { + !(sc->sc_carpdev->if_flags & IFF_UP) || + !V_carp_allow) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); @@ -2088,6 +2077,33 @@ carp_demote_adj(int adj, char *reason) } static int +carp_allow_sysctl(SYSCTL_HANDLER_ARGS) +{ + int new, error; + struct carp_softc *sc; + + new = V_carp_allow; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || !req->newptr) + return (error); + + if (V_carp_allow != new) { + V_carp_allow = new; + + mtx_lock(&carp_mtx); + LIST_FOREACH(sc, &carp_list, sc_next) { + CARP_LOCK(sc); + if (curvnet == sc->sc_carpdev->if_vnet) + carp_sc_state(sc); + CARP_UNLOCK(sc); + } + mtx_unlock(&carp_mtx); + } + + return (0); +} + +static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c index 53a0445e..84f39023 100644 --- a/freebsd/sys/netinet/ip_divert.c +++ b/freebsd/sys/netinet/ip_divert.c @@ -76,7 +76,6 @@ __FBSDID("$FreeBSD$"); #endif #include <security/mac/mac_framework.h> - /* * Divert sockets */ @@ -237,7 +236,7 @@ divert_packet(struct mbuf *m, int incoming) /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; if_addr_rlock(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = @@ -471,13 +470,15 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; + NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { error = EADDRNOTAVAIL; + NET_EPOCH_EXIT(); goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; - ifa_free(ifa); + NET_EPOCH_EXIT(); } #ifdef MAC mac_socket_create_mbuf(so, m); @@ -553,6 +554,7 @@ div_detach(struct socket *so) KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); + /* XXX defer destruction to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); @@ -632,6 +634,7 @@ static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; + struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -671,9 +674,8 @@ div_pcblist(SYSCTL_HANDLER_ARGS) if (error) return error; - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return ENOMEM; + il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); + inp_list = il->il_inp_list; INP_INFO_RLOCK(&V_divcbinfo); for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; @@ -702,14 +704,9 @@ div_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_divcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_divcbinfo); + il->il_count = n; + il->il_pcbinfo = &V_divcbinfo; + epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* @@ -726,7 +723,6 @@ div_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_RUNLOCK(&V_divcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); return error; } @@ -806,6 +802,7 @@ div_modevent(module_t mod, int type, void *unused) break; } ip_divert_ptr = NULL; + /* XXX defer to epoch_call ? */ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); #ifndef VIMAGE diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c index d0866b00..52cd0b40 100644 --- a/freebsd/sys/netinet/ip_encap.c +++ b/freebsd/sys/netinet/ip_encap.c @@ -110,15 +110,6 @@ static struct mtx encapmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab); -/* - * We currently keey encap_init() for source code compatibility reasons -- - * it's referenced by KAME pieces in netinet6. - */ -void -encap_init(void) -{ -} - #ifdef INET int encap4_input(struct mbuf **mp, int *offp, int proto) diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h index bbbee390..ef232189 100644 --- a/freebsd/sys/netinet/ip_encap.h +++ b/freebsd/sys/netinet/ip_encap.h @@ -50,7 +50,6 @@ struct encaptab { void *arg; /* passed via m->m_pkthdr.aux */ }; -void encap_init(void); int encap4_input(struct mbuf **, int *, int); int encap6_input(struct mbuf **, int *, int); const struct encaptab *encap_attach(int, int, const struct sockaddr *, diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c index b03fea56..3fc59a14 100644 --- a/freebsd/sys/netinet/ip_icmp.c +++ b/freebsd/sys/netinet/ip_icmp.c @@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcpip.h> #include <netinet/icmp_var.h> + #ifdef INET #include <machine/in_cksum.h> @@ -407,6 +408,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif + NET_EPOCH_ENTER(); if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; @@ -414,6 +416,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -531,6 +534,7 @@ icmp_input(struct mbuf **mp, int *offp, int proto) if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -606,10 +610,8 @@ icmp_input(struct mbuf **mp, int *offp, int proto) (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; - if (ia->ia_ifp == NULL) { - ifa_free(&ia->ia_ifa); + if (ia->ia_ifp == NULL) break; - } icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; @@ -621,11 +623,11 @@ icmp_input(struct mbuf **mp, int *offp, int proto) else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } - ifa_free(&ia->ia_ifa); reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); + NET_EPOCH_EXIT(); return (IPPROTO_DONE); case ICMP_REDIRECT: @@ -702,11 +704,13 @@ reflect: } raw: + NET_EPOCH_EXIT(); *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: + NET_EPOCH_EXIT(); m_freem(m); return (IPPROTO_DONE); } @@ -762,7 +766,7 @@ icmp_reflect(struct mbuf *m) ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); @@ -783,7 +787,7 @@ icmp_reflect(struct mbuf *m) */ if (V_icmp_rfi && ifp != NULL) { IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); @@ -801,7 +805,7 @@ icmp_reflect(struct mbuf *m) */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c index 2c8bf427..343eec5e 100644 --- a/freebsd/sys/netinet/ip_input.c +++ b/freebsd/sys/netinet/ip_input.c @@ -306,7 +306,7 @@ ip_init(void) struct protosw *pr; int i; - TAILQ_INIT(&V_in_ifaddrhead); + CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ @@ -401,7 +401,7 @@ ip_destroy(void *unused __unused) /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); - TAILQ_FOREACH(ifp, &V_ifnet, if_link) + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); @@ -652,7 +652,7 @@ passin: * we receive might be for us (and let the upper layers deal * with it). */ - if (TAILQ_EMPTY(&V_in_ifaddrhead) && + if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; @@ -709,7 +709,7 @@ passin: */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); @@ -979,9 +979,9 @@ ip_forward(struct mbuf *m, int srcrt) #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif + NET_EPOCH_ENTER(); if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); - ifa_ref(&ia->ia_ifa); } else ia = NULL; /* @@ -1027,7 +1027,7 @@ ip_forward(struct mbuf *m, int srcrt) m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); - return; + goto out; } /* No IPsec processing required */ } @@ -1080,16 +1080,12 @@ ip_forward(struct mbuf *m, int srcrt) else { if (mcopy) m_freem(mcopy); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; + goto out; } } - if (mcopy == NULL) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; - } + if (mcopy == NULL) + goto out; + switch (error) { @@ -1131,13 +1127,11 @@ ip_forward(struct mbuf *m, int srcrt) case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; + goto out; } - if (ia != NULL) - ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); + out: + NET_EPOCH_EXIT(); } #define CHECK_SO_CT(sp, ct) \ diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index 3bf4fa91..ac901601 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -880,13 +880,15 @@ add_vif(struct vifctl *vifcp) ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; + NET_EPOCH_ENTER(); ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { + NET_EPOCH_EXIT(); VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; - ifa_free(ifa); + NET_EPOCH_EXIT(); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { @@ -1682,7 +1684,7 @@ send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; - int error; + int error __unused; VIF_LOCK_ASSERT(); diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c index d85aecf3..cc2f3eed 100644 --- a/freebsd/sys/netinet/ip_options.c +++ b/freebsd/sys/netinet/ip_options.c @@ -112,6 +112,7 @@ ip_dooptions(struct mbuf *m, int pass) struct nhop4_extended nh_ext; struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; + NET_EPOCH_ENTER(); /* Ignore or reject packets with IP options. */ if (V_ip_doopts == 0) return 0; @@ -226,6 +227,7 @@ dropit: #endif IPSTAT_INC(ips_cantforward); m_freem(m); + NET_EPOCH_EXIT(); return (1); } } @@ -252,7 +254,6 @@ dropit: memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); } else { /* XXX MRT 0 for routing */ if (fib4_lookup_nh_ext(M_GETFIB(m), @@ -300,7 +301,6 @@ dropit: if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); } else if (fib4_lookup_nh_ext(M_GETFIB(m), ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { memcpy(cp + off, &nh_ext.nh_src, @@ -355,7 +355,6 @@ dropit: continue; (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); off += sizeof(struct in_addr); break; @@ -383,12 +382,14 @@ dropit: cp[IPOPT_OFFSET] += sizeof(uint32_t); } } + NET_EPOCH_EXIT(); if (forward && V_ipforwarding) { ip_forward(m, 1); return (1); } return (0); bad: + NET_EPOCH_EXIT(); icmp_error(m, type, code, 0, 0); IPSTAT_INC(ips_badoptions); return (1); diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index 21b3919a..792f2311 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -227,7 +227,6 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ uint32_t fibnum; - int have_ia_ref; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif @@ -283,6 +282,7 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } + NET_EPOCH_ENTER(); again: /* * Validate route against routing table additions; @@ -308,7 +308,6 @@ again: rte = NULL; } ia = NULL; - have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an @@ -324,7 +323,6 @@ again: error = ENETUNREACH; goto bad; } - have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; @@ -339,7 +337,6 @@ again: error = ENETUNREACH; goto bad; } - have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? @@ -352,8 +349,6 @@ again: */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia, &in_ifa_tracker); - if (ia) - have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* @@ -581,8 +576,6 @@ sendit: case -1: /* Need to try again */ /* Reset everything for a new round */ RO_RTFREE(ro); - if (have_ia_ref) - ifa_free(&ia->ia_ifa); ro->ro_prepend = NULL; rte = NULL; gw = dst; @@ -737,10 +730,9 @@ done: * calling RTFREE on it again. */ ro->ro_rt = NULL; - if (have_ia_ref) - ifa_free(&ia->ia_ifa); + NET_EPOCH_EXIT(); return (error); -bad: + bad: m_freem(m); goto done; } diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h index 9e7ee591..f874628a 100644 --- a/freebsd/sys/netinet/ip_var.h +++ b/freebsd/sys/netinet/ip_var.h @@ -36,6 +36,7 @@ #define _NETINET_IP_VAR_H_ #include <sys/queue.h> +#include <sys/epoch.h> /* * Overlay for ip header used by other protocols (tcp, udp). @@ -95,7 +96,7 @@ struct ip_moptions { u_short imo_max_memberships; /* max memberships this socket */ struct in_multi **imo_membership; /* group memberships */ struct in_mfilter *imo_mfilters; /* source filters */ - STAILQ_ENTRY(ip_moptions) imo_link; + struct epoch_context imo_epoch_ctx; }; struct ipstat { @@ -175,6 +176,7 @@ struct ip; struct inpcb; struct route; struct sockopt; +struct inpcbinfo; VNET_DECLARE(int, ip_defttl); /* default IP ttl */ VNET_DECLARE(int, ipforwarding); /* ip forwarding */ diff --git a/freebsd/sys/netinet/netdump/netdump.h b/freebsd/sys/netinet/netdump/netdump.h new file mode 100644 index 00000000..12a527ee --- /dev/null +++ b/freebsd/sys/netinet/netdump/netdump.h @@ -0,0 +1,132 @@ +/*- + * Copyright (c) 2005-2014 Sandvine Incorporated + * Copyright (c) 2000 Darrell Anderson <anderson@cs.duke.edu> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_NETDUMP_H_ +#define _NETINET_NETDUMP_H_ + +#include <sys/types.h> +#include <sys/disk.h> +#include <sys/ioccom.h> + +#include <net/if.h> +#include <netinet/in.h> + +#define NETDUMP_PORT 20023 /* Server UDP port for heralds. */ +#define NETDUMP_ACKPORT 20024 /* Client UDP port for acks. */ + +#define NETDUMP_HERALD 1 /* Broadcast before starting a dump. */ +#define NETDUMP_FINISHED 2 /* Send after finishing a dump. */ +#define NETDUMP_VMCORE 3 /* Contains dump data. */ +#define NETDUMP_KDH 4 /* Contains kernel dump header. */ +#define NETDUMP_EKCD_KEY 5 /* Contains kernel dump key. */ + +#define NETDUMP_DATASIZE 4096 /* Arbitrary packet size limit. */ + +struct netdump_msg_hdr { + uint32_t mh_type; /* Netdump message type. */ + uint32_t mh_seqno; /* Match acks with msgs. */ + uint64_t mh_offset; /* vmcore offset (bytes). */ + uint32_t mh_len; /* Attached data (bytes). */ + uint32_t mh__pad; +} __packed; + +struct netdump_ack { + uint32_t na_seqno; /* Match acks with msgs. */ +} __packed; + +struct netdump_conf { +#ifndef __rtems__ + struct diocskerneldump_arg ndc_kda; +#endif /* __rtems__ */ + char ndc_iface[IFNAMSIZ]; + struct in_addr ndc_server; + struct in_addr ndc_client; + struct in_addr ndc_gateway; +}; + +#define _PATH_NETDUMP "/dev/netdump" + +#define NETDUMPGCONF _IOR('n', 1, struct netdump_conf) +#define NETDUMPSCONF _IOW('n', 2, struct netdump_conf) + +#ifdef _KERNEL +#ifdef NETDUMP + +#define NETDUMP_MAX_IN_FLIGHT 64 + +enum netdump_ev { + NETDUMP_START, + NETDUMP_END, +}; + +struct ifnet; +struct mbuf; + +void netdump_reinit(struct ifnet *); + +typedef void netdump_init_t(struct ifnet *, int *nrxr, int *ncl, int *clsize); +typedef void netdump_event_t(struct ifnet *, enum netdump_ev); +typedef int netdump_transmit_t(struct ifnet *, struct mbuf *); +typedef int netdump_poll_t(struct ifnet *, int); + +struct netdump_methods { + netdump_init_t *nd_init; + netdump_event_t *nd_event; + netdump_transmit_t *nd_transmit; + netdump_poll_t *nd_poll; +}; + +#define NETDUMP_DEFINE(driver) \ + static netdump_init_t driver##_netdump_init; \ + static netdump_event_t driver##_netdump_event; \ + static netdump_transmit_t driver##_netdump_transmit; \ + static netdump_poll_t driver##_netdump_poll; \ + \ + static struct netdump_methods driver##_netdump_methods = { \ + .nd_init = driver##_netdump_init, \ + .nd_event = driver##_netdump_event, \ + .nd_transmit = driver##_netdump_transmit, \ + .nd_poll = driver##_netdump_poll, \ + } + +#define NETDUMP_REINIT(ifp) netdump_reinit(ifp) + +#define NETDUMP_SET(ifp, driver) \ + (ifp)->if_netdump_methods = &driver##_netdump_methods + +#else /* !NETDUMP */ + +#define NETDUMP_DEFINE(driver) +#define NETDUMP_REINIT(ifp) +#define NETDUMP_SET(ifp, driver) + +#endif /* NETDUMP */ +#endif /* _KERNEL */ + +#endif /* _NETINET_NETDUMP_H_ */ diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c index 0ed185ae..7dea3ec1 100644 --- a/freebsd/sys/netinet/raw_ip.c +++ b/freebsd/sys/netinet/raw_ip.c @@ -745,7 +745,7 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); @@ -771,7 +771,7 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) case PRC_IFUP: IN_IFADDR_RLOCK(&in_ifa_tracker); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } @@ -853,6 +853,7 @@ rip_detach(struct socket *so) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); + /* XXX defer to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); @@ -930,7 +931,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); - if (TAILQ_EMPTY(&V_ifnet) || + if (CK_STAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && @@ -955,7 +956,7 @@ rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) if (nam->sa_len != sizeof(*addr)) return (EINVAL); - if (TAILQ_EMPTY(&V_ifnet)) + if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); @@ -1022,6 +1023,7 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; + struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -1056,9 +1058,8 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); + il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); + inp_list = il->il_inp_list; INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; @@ -1087,14 +1088,9 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_ripcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_ripcbinfo); + il->il_count = n; + il->il_pcbinfo = &V_ripcbinfo; + epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* @@ -1110,7 +1106,6 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); return (error); } diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c index 7e2ef189..94c23bff 100644 --- a/freebsd/sys/netinet/sctp_bsd_addr.c +++ b/freebsd/sys/netinet/sctp_bsd_addr.c @@ -209,13 +209,13 @@ sctp_init_ifns_for_vrf(int vrfid) #endif IFNET_RLOCK(); - TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { + CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { if (sctp_is_desired_interface_type(ifn) == 0) { /* non desired type */ continue; } IF_ADDR_RLOCK(ifn); - TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) { continue; } @@ -362,11 +362,11 @@ void struct ifaddr *ifa; IFNET_RLOCK(); - TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { + CK_STAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_link) { if (!(*pred) (ifn)) { continue; } - TAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { + CK_STAILQ_FOREACH(ifa, &ifn->if_addrhead, ifa_link) { sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE); } } @@ -389,10 +389,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, m_freem(m); return (NULL); } - } - if (SCTP_BUF_NEXT(m)) { - sctp_m_freem(SCTP_BUF_NEXT(m)); - SCTP_BUF_NEXT(m) = NULL; + KASSERT(SCTP_BUF_NEXT(m) == NULL, ("%s: no chain allowed", __FUNCTION__)); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index 3325dd03..98b397a2 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -1673,9 +1673,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_nets *net, uint32_t *high_tsn, int *abort_flag, int *break_flag, int last_chunk, uint8_t chk_type) { - /* Process a data chunk */ - /* struct sctp_tmit_chunk *chk; */ - struct sctp_tmit_chunk *chk; + struct sctp_tmit_chunk *chk = NULL; /* make gcc happy */ uint32_t tsn, fsn, gap, mid; struct mbuf *dmbuf; int the_len; @@ -3623,7 +3621,9 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, SCTP_SO_NOT_LOCKED); } /* Make sure to flag we had a FR */ - tp1->whoTo->net_ack++; + if (tp1->whoTo != NULL) { + tp1->whoTo->net_ack++; + } continue; } } diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index 9a74ef4b..ee206551 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -2618,7 +2618,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, (sizeof(uint32_t)))); diff = now; timevalsub(&diff, &time_expires); - if (diff.tv_sec > UINT32_MAX / 1000000) { + if ((uint32_t)diff.tv_sec > UINT32_MAX / 1000000) { staleness = UINT32_MAX; } else { staleness = diff.tv_sec * 1000000; diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h index c9eaa069..d8d9e6e8 100644 --- a/freebsd/sys/netinet/sctp_os_bsd.h +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -40,7 +40,6 @@ __FBSDID("$FreeBSD$"); /* * includes */ -#include <rtems/bsd/local/opt_compat.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_sctp.h> diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index 9dd2e0fa..bdef958c 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -7452,7 +7452,7 @@ dont_do_it: /* Not enough room for a chunk header, get some */ struct mbuf *m; - m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 0, MT_DATA); + m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* * we're in trouble here. _PREPEND below will free @@ -11032,9 +11032,8 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, struct sctp_chunkhdr *ch; #if defined(INET) || defined(INET6) struct udphdr *udp; - int ret; #endif - int len, cause_len, padding_len; + int ret, len, cause_len, padding_len; #ifdef INET struct sockaddr_in *src_sin, *dst_sin; struct ip *ip; @@ -11261,9 +11260,13 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, SCTP_LTRACE_ERR_RET_PKT(mout, NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); return; } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + if (ret) { + SCTP_STAT_INCR(sctps_senderrors); + } return; } diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index 05ddee01..071d44c2 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -206,7 +206,7 @@ sctp_notify(struct sctp_inpcb *inp, #endif /* no need to unlock here, since the TCB is gone */ } else if (icmp_code == ICMP_UNREACH_NEEDFRAG) { - if ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0) { + if (net->dest_state & SCTP_ADDR_NO_PMTUD) { SCTP_TCB_UNLOCK(stcb); return; } @@ -707,22 +707,10 @@ sctp_disconnect(struct socket *so) if (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) { /* Left with Data unread */ - struct mbuf *err; - - err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); - if (err) { - /* - * Fill in the user - * initiated abort - */ - struct sctp_paramhdr *ph; + struct mbuf *op_err; - ph = mtod(err, struct sctp_paramhdr *); - SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr); - ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); - ph->param_length = htons(SCTP_BUF_LEN(err)); - } - sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED); + op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); + sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } SCTP_INP_RUNLOCK(inp); diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index 5511df64..aad1e19d 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -74,6 +74,7 @@ extern const struct sctp_ss_functions sctp_ss_functions[]; void sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.sb.stcb = stcb; @@ -90,11 +91,13 @@ sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.close.inp = (void *)inp; @@ -114,11 +117,13 @@ sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void rto_logging(struct sctp_nets *net, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); @@ -131,11 +136,13 @@ rto_logging(struct sctp_nets *net, int from) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.strlog.stcb = stcb; @@ -151,11 +158,13 @@ sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16 sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_nagle_event(struct sctp_tcb *stcb, int action) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.nagle.stcb = (void *)stcb; @@ -170,11 +179,13 @@ sctp_log_nagle_event(struct sctp_tcb *stcb, int action) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.sack.cumack = cumack; @@ -189,11 +200,13 @@ sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); @@ -207,11 +220,13 @@ sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); @@ -225,12 +240,14 @@ sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int fr sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } #ifdef SCTP_MBUF_LOGGING void sctp_log_mb(struct mbuf *m, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.mb.mp = m; @@ -251,6 +268,7 @@ sctp_log_mb(struct mbuf *m, int from) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void @@ -267,6 +285,7 @@ sctp_log_mbc(struct mbuf *m, int from) void sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; if (control == NULL) { @@ -291,11 +310,13 @@ sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_rea sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.cwnd.net = net; @@ -326,11 +347,13 @@ sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); @@ -370,11 +393,13 @@ sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); @@ -397,11 +422,13 @@ sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int b sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.rwnd.rwnd = peers_rwnd; @@ -415,11 +442,13 @@ sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t ove sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.rwnd.rwnd = peers_rwnd; @@ -433,12 +462,14 @@ sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint3 sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } #ifdef SCTP_MBCNT_LOGGING static void sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.mbcnt.total_queue_size = total_oq; @@ -452,21 +483,25 @@ sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mb sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } #endif void sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d) { +#if defined(SCTP_LOCAL_TRACE_BUF) SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_MISC_EVENT, from, a, b, c, d); +#endif } void sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.wake.stcb = (void *)stcb; @@ -508,11 +543,13 @@ sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } void sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen) { +#if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.blk.onsb = asoc->total_output_queue_size; @@ -529,6 +566,7 @@ sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen) sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); +#endif } int @@ -760,8 +798,8 @@ sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb) } /* - * a list of sizes based on typical mtu's, used only if next hop size not - * returned. + * A list of sizes based on typical mtu's, used only if next hop size not + * returned. These values MUST be multiples of 4 and MUST be ordered. */ static uint32_t sctp_mtu_sizes[] = { 68, @@ -770,29 +808,32 @@ static uint32_t sctp_mtu_sizes[] = { 512, 544, 576, - 1006, + 1004, 1492, 1500, 1536, - 2002, + 2000, 2048, 4352, 4464, 8166, - 17914, + 17912, 32000, - 65535 + 65532 }; /* - * Return the largest MTU smaller than val. If there is no - * entry, just return val. + * Return the largest MTU in sctp_mtu_sizes smaller than val. + * If val is smaller than the minimum, just return the largest + * multiple of 4 smaller or equal to val. + * Ensure that the result is a multiple of 4. */ uint32_t sctp_get_prev_mtu(uint32_t val) { uint32_t i; + val &= 0xfffffffc; if (val <= sctp_mtu_sizes[0]) { return (val); } @@ -801,12 +842,16 @@ sctp_get_prev_mtu(uint32_t val) break; } } + KASSERT((sctp_mtu_sizes[i - 1] & 0x00000003) == 0, + ("sctp_mtu_sizes[%u] not a multiple of 4", i - 1)); return (sctp_mtu_sizes[i - 1]); } /* - * Return the smallest MTU larger than val. If there is no - * entry, just return val. + * Return the smallest MTU in sctp_mtu_sizes larger than val. + * If val is larger than the maximum, just return the largest multiple of 4 smaller + * or equal to val. + * Ensure that the result is a multiple of 4. */ uint32_t sctp_get_next_mtu(uint32_t val) @@ -814,8 +859,11 @@ sctp_get_next_mtu(uint32_t val) /* select another MTU that is just bigger than this one */ uint32_t i; + val &= 0xfffffffc; for (i = 0; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) { if (val < sctp_mtu_sizes[i]) { + KASSERT((sctp_mtu_sizes[i] & 0x00000003) == 0, + ("sctp_mtu_sizes[%u] not a multiple of 4", i)); return (sctp_mtu_sizes[i]); } } @@ -2662,6 +2710,13 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, notif_len = (unsigned int)sizeof(struct sctp_assoc_change); if (abort != NULL) { abort_len = ntohs(abort->ch.chunk_length); + /* + * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be + * contiguous. + */ + if (abort_len > SCTP_CHUNK_BUFFER_SIZE) { + abort_len = SCTP_CHUNK_BUFFER_SIZE; + } } else { abort_len = 0; } @@ -3567,6 +3622,13 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro } if (chunk != NULL) { chunk_len = ntohs(chunk->ch.chunk_length); + /* + * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be + * contiguous. + */ + if (chunk_len > SCTP_CHUNK_BUFFER_SIZE) { + chunk_len = SCTP_CHUNK_BUFFER_SIZE; + } } else { chunk_len = 0; } diff --git a/freebsd/sys/netinet/tcp_hpts.h b/freebsd/sys/netinet/tcp_hpts.h new file mode 100644 index 00000000..c52a1d78 --- /dev/null +++ b/freebsd/sys/netinet/tcp_hpts.h @@ -0,0 +1,304 @@ +/*- + * Copyright (c) 2016-2018 Netflix Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __tcp_hpts_h__ +#define __tcp_hpts_h__ + +/* + * The hpts uses a 102400 wheel. The wheel + * defines the time in 10 usec increments (102400 x 10). + * This gives a range of 10usec - 1024ms to place + * an entry within. If the user requests more than + * 1.024 second, a remaineder is attached and the hpts + * when seeing the remainder will re-insert the + * inpcb forward in time from where it is until + * the remainder is zero. + */ + +#define NUM_OF_HPTSI_SLOTS 102400 + +TAILQ_HEAD(hptsh, inpcb); + +/* Number of useconds in a hpts tick */ +#define HPTS_TICKS_PER_USEC 10 +#define HPTS_MS_TO_SLOTS(x) (x * 100) +#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) +#define HPTS_USEC_IN_SEC 1000000 +#define HPTS_MSEC_IN_SEC 1000 +#define HPTS_USEC_IN_MSEC 1000 + +#define DEFAULT_HPTS_LOG 3072 + +/* + * Log flags consist of + * 7f 7f 1 1 bits + * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE + * + * So for example cpu 10, number 10 would with + * input active would show up as: + * p_flags = 0001010 0001010 1 0 + * <or> + * p_flags = 0x142a + */ +#define HPTS_HPTS_ACTIVE 0x01 +#define HPTS_INPUT_ACTIVE 0x02 + +#define HPTSLOG_IMMEDIATE 1 +#define HPTSLOG_INSERT_NORMAL 2 +#define HPTSLOG_INSERT_SLEEPER 3 +#define HPTSLOG_SLEEP_AFTER 4 +#define HPTSLOG_SLEEP_BEFORE 5 +#define HPTSLOG_INSERTED 6 +#define HPTSLOG_WAKEUP_HPTS 7 +#define HPTSLOG_SETTORUN 8 +#define HPTSLOG_HPTSI 9 +#define HPTSLOG_TOLONG 10 +#define HPTSLOG_AWAKENS 11 +#define HPTSLOG_TIMESOUT 12 +#define HPTSLOG_SLEEPSET 13 +#define HPTSLOG_WAKEUP_INPUT 14 +#define HPTSLOG_RESCHEDULE 15 +#define HPTSLOG_AWAKE 16 +#define HPTSLOG_INP_DONE 17 + +struct hpts_log { + struct inpcb *inp; + int32_t event; + uint32_t cts; + int32_t line; + uint32_t ticknow; + uint32_t t_paceslot; + uint32_t t_hptsreq; + uint32_t p_curtick; + uint32_t p_prevtick; + uint32_t slot_req; + uint32_t p_on_queue_cnt; + uint32_t p_nxt_slot; + uint32_t p_cur_slot; + uint32_t p_hpts_sleep_time; + uint16_t p_flags; + uint8_t p_onhpts; + uint8_t p_oninput; + uint8_t is_notempty; +}; + +struct hpts_diag { + uint32_t p_hpts_active; + uint32_t p_nxt_slot; + uint32_t p_cur_slot; + uint32_t slot_req; + uint32_t inp_hptsslot; + uint32_t slot_now; + uint32_t have_slept; + uint32_t hpts_sleep_time; + uint32_t yet_to_sleep; + uint32_t need_new_to; + int32_t co_ret; + uint8_t p_on_min_sleep; +}; + +#ifdef _KERNEL +/* Each hpts has its own p_mtx which is used for locking */ +struct tcp_hpts_entry { + /* Cache line 0x00 */ + struct mtx p_mtx; /* Mutex for hpts */ + uint32_t p_hpts_active; /* Flag that says hpts is awake */ + uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ + uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ + uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ + uint32_t p_nxt_slot; /* The next slot outside the current range of + * slots that the hpts is running on. */ + int32_t p_on_queue_cnt; /* Count on queue in this hpts */ + uint32_t enobuf_cnt; + uint16_t p_log_at; + uint8_t p_direct_wake :1, /* boolean */ + p_log_wrapped :1, /* boolean */ + p_on_min_sleep:1; /* boolean */ + uint8_t p_fill; + /* Cache line 0x40 */ + void *p_inp; + struct hptsh p_input; /* For the tcp-input runner */ + /* Hptsi wheel */ + struct hptsh *p_hptss; + struct hpts_log *p_log; + uint32_t p_logsize; + int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ + uint32_t hit_no_enobuf; + uint32_t p_dyn_adjust; + uint32_t p_hpts_sleep_time; /* Current sleep interval having a max + * of 255ms */ + uint32_t p_delayed_by; /* How much were we delayed by */ + /* Cache line 0x80 */ + struct sysctl_ctx_list hpts_ctx; + struct sysctl_oid *hpts_root; + struct intr_event *ie; + void *ie_cookie; + uint16_t p_num; /* The hpts number one per cpu */ + uint16_t p_cpu; /* The hpts CPU */ + /* There is extra space in here */ + /* Cache line 0x100 */ + struct callout co __aligned(CACHE_LINE_SIZE); +} __aligned(CACHE_LINE_SIZE); + +struct tcp_hptsi { + struct proc *rp_proc; /* Process structure for hpts */ + struct tcp_hpts_entry **rp_ent; /* Array of hptss */ + uint32_t rp_num_hptss; /* Number of hpts threads */ +}; + +#endif + +#define HPTS_REMOVE_INPUT 0x01 +#define HPTS_REMOVE_OUTPUT 0x02 +#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT) + +/* + * When using the hpts, a TCP stack must make sure + * that once a INP_DROPPED flag is applied to a INP + * that it does not expect tcp_output() to ever be + * called by the hpts. The hpts will *not* call + * any output (or input) functions on a TCB that + * is in the DROPPED state. + * + * This implies final ACK's and RST's that might + * be sent when a TCB is still around must be + * sent from a routine like tcp_respond(). + */ +#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep + * this determines min granularity of the + * hpts. If 0, granularity is 10useconds at + * the cost of more CPU (context switching). */ +#ifdef _KERNEL +#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) +struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp); +struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp); +int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line); +#define tcp_queue_to_hpts_immediate(a)__tcp_queue_to_hpts_immediate(a, __LINE__) + +struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp); +#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__) +void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line); + +/* + * To insert a TCB on the hpts you *must* be holding the + * INP_WLOCK(). The hpts insert code will then acqurire + * the hpts's lock and insert the TCB on the requested + * slot possibly waking up the hpts if you are requesting + * a time earlier than what the hpts is sleeping to (if + * the hpts is sleeping). You may check the inp->inp_in_hpts + * flag without the hpts lock. The hpts is the only one + * that will clear this flag holding only the hpts lock. This + * means that in your tcp_output() routine when you test for + * it to be 1 (so you wont call output) it may be transitioning + * to 0 (by the hpts). That will be fine since that will just + * mean an extra call to tcp_output that most likely will find + * the call you executed (when the mis-match occured) will have + * put the TCB back on the hpts and it will return. If your + * call did not add it back to the hpts then you will either + * over-send or the cwnd will block you from sending more. + * + * Note you should also be holding the INP_WLOCK() when you + * call the remove from the hpts as well. Thoug usually + * you are either doing this from a timer, where you need + * that INP_WLOCK() or from destroying your TCB where again + * you should already have the INP_WLOCK(). + */ +uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line); +#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__) + +uint32_t +tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag); + +int + __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); +#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); +void +tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked); +int +__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line); +#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) + +uint16_t tcp_hpts_delayedby(struct inpcb *inp); + +void __tcp_set_hpts(struct inpcb *inp, int32_t line); +#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) + +void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); +#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) + +extern int32_t tcp_min_hptsi_time; + +static __inline uint32_t +tcp_tv_to_hptstick(struct timeval *sv) +{ + return ((sv->tv_sec * 100000) + (sv->tv_usec / 10)); +} + +static __inline uint32_t +tcp_gethptstick(struct timeval *sv) +{ + struct timeval tv; + + if (sv == NULL) + sv = &tv; + microuptime(sv); + return (tcp_tv_to_hptstick(sv)); +} + +static __inline uint32_t +tcp_tv_to_usectick(struct timeval *sv) +{ + return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); +} + +static __inline uint32_t +tcp_tv_to_mssectick(struct timeval *sv) +{ + return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); +} + +static __inline void +tcp_hpts_unlock(struct tcp_hpts_entry *hpts) +{ + mtx_unlock(&hpts->p_mtx); +} + +static __inline uint32_t +tcp_get_usecs(struct timeval *tv) +{ + struct timeval tvd; + + if (tv == NULL) + tv = &tvd; + microuptime(tv); + return (tcp_tv_to_usectick(tv)); +} + +#endif /* _KERNEL */ +#endif /* __tcp_hpts_h__ */ diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index 7c907da9..20bea2de 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -1684,6 +1684,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; + else if (tp->t_flags & TF_PREVVALID && + tp->t_badrxtwin != 0 && SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) + cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case @@ -1796,9 +1799,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC(tcps_predack); /* - * "bad retransmit" recovery. + * "bad retransmit" recovery without timestamps. */ - if (tp->t_rxtshift == 1 && + if ((to.to_flags & TOF_TS) == 0 && + tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); @@ -2789,8 +2793,10 @@ process_ACK: * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && - (int)(ticks - tp->t_badrxtwin) < 0) + if (tp->t_rxtshift == 1 && + tp->t_flags & TF_PREVVALID && + tp->t_badrxtwin && + SEQ_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c index 41302db1..f3ab3b50 100644 --- a/freebsd/sys/netinet/tcp_offload.c +++ b/freebsd/sys/netinet/tcp_offload.c @@ -170,6 +170,17 @@ tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name) } void +tcp_offload_tcp_info(struct tcpcb *tp, struct tcp_info *ti) +{ + struct toedev *tod = tp->tod; + + KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + tod->tod_tcp_info(tod, tp, ti); +} + +void tcp_offload_detach(struct tcpcb *tp) { struct toedev *tod = tp->tod; diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h index 8485fa29..f755ce7e 100644 --- a/freebsd/sys/netinet/tcp_offload.h +++ b/freebsd/sys/netinet/tcp_offload.h @@ -45,6 +45,7 @@ void tcp_offload_input(struct tcpcb *, struct mbuf *); int tcp_offload_output(struct tcpcb *); void tcp_offload_rcvd(struct tcpcb *); void tcp_offload_ctloutput(struct tcpcb *, int, int); +void tcp_offload_tcp_info(struct tcpcb *, struct tcp_info *); void tcp_offload_detach(struct tcpcb *); #endif diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index 8762407f..bdbfe984 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -208,7 +208,7 @@ tcp_output(struct tcpcb *tp) #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif - int idle, sendalot; + int idle, sendalot, curticks; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; @@ -810,9 +810,12 @@ send: /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = tcp_ts_getticks() + tp->ts_offset; + curticks = tcp_ts_getticks(); + to.to_tsval = curticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; + if (tp->t_rxtshift == 1) + tp->t_badrxtwin = curticks; } /* Set receive buffer autosizing timestamp. */ @@ -1313,10 +1316,6 @@ send: } #endif - /* We're getting ready to send; log now. */ - TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, - len, NULL, false); - /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. @@ -1365,6 +1364,10 @@ send: #endif /* TCPDEBUG */ TCP_PROBE3(debug__output, tp, th, m); + /* We're getting ready to send; log now. */ + TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, + len, NULL, false); + /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way @@ -1588,8 +1591,6 @@ timer: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EACCES: - tp->t_softerror = error; - return (0); case EPERM: tp->t_softerror = error; return (error); diff --git a/freebsd/sys/netinet/tcp_seq.h b/freebsd/sys/netinet/tcp_seq.h index b29ae2aa..b6e682ec 100644 --- a/freebsd/sys/netinet/tcp_seq.h +++ b/freebsd/sys/netinet/tcp_seq.h @@ -47,10 +47,10 @@ #define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) #define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) -#define WIN_LT(a,b) ((short)(ntohs(a)-ntohs(b)) < 0) -#define WIN_LEQ(a,b) ((short)(ntohs(a)-ntohs(b)) <= 0) -#define WIN_GT(a,b) ((short)(ntohs(a)-ntohs(b)) > 0) -#define WIN_GEQ(a,b) ((short)(ntohs(a)-ntohs(b)) >= 0) +#define WIN_LT(a,b) (ntohs(a) < ntohs(b)) +#define WIN_LEQ(a,b) (ntohs(a) <= ntohs(b)) +#define WIN_GT(a,b) (ntohs(a) > ntohs(b)) +#define WIN_GEQ(a,b) (ntohs(a) >= ntohs(b)) #define WIN_MIN(a, b) ((WIN_LT(a, b)) ? (a) : (b)) #define WIN_MAX(a, b) ((WIN_GT(a, b)) ? (a) : (b)) diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index 1b19aecb..787213b0 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -40,7 +40,6 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_compat.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <rtems/bsd/local/opt_ipsec.h> @@ -106,6 +105,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_var.h> #include <netinet/tcp_log_buf.h> #include <netinet/tcp_syncache.h> +#include <netinet/tcp_hpts.h> #include <netinet/cc/cc.h> #ifdef INET6 #include <netinet6/tcp6_var.h> @@ -239,6 +239,9 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone); VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif +static int tcp_default_fb_init(struct tcpcb *tp); +static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); +static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); @@ -247,21 +250,17 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, static struct tcp_function_block tcp_def_funcblk = { - "default", - tcp_output, - tcp_do_segment, - tcp_default_ctloutput, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - 0, - 0 + .tfb_tcp_block_name = "freebsd", + .tfb_tcp_output = tcp_output, + .tfb_tcp_do_segment = tcp_do_segment, + .tfb_tcp_ctloutput = tcp_default_ctloutput, + .tfb_tcp_handoff_ok = tcp_default_handoff_ok, + .tfb_tcp_fb_init = tcp_default_fb_init, + .tfb_tcp_fb_fini = tcp_default_fb_fini, }; int t_functions_inited = 0; +static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; @@ -334,6 +333,88 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk) return(rblk); } +static struct tcp_function_block * +find_and_ref_tcp_default_fb(void) +{ + struct tcp_function_block *rblk; + + rw_rlock(&tcp_function_lock); + rblk = tcp_func_set_ptr; + refcount_acquire(&rblk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return (rblk); +} + +void +tcp_switch_back_to_default(struct tcpcb *tp) +{ + struct tcp_function_block *tfb; + + KASSERT(tp->t_fb != &tcp_def_funcblk, + ("%s: called by the built-in default stack", __func__)); + + /* + * Release the old stack. This function will either find a new one + * or panic. + */ + if (tp->t_fb->tfb_tcp_fb_fini != NULL) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + + /* + * Now, we'll find a new function block to use. + * Start by trying the current user-selected + * default, unless this stack is the user-selected + * default. + */ + tfb = find_and_ref_tcp_default_fb(); + if (tfb == tp->t_fb) { + refcount_release(&tfb->tfb_refcnt); + tfb = NULL; + } + /* Does the stack accept this connection? */ + if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && + (*tfb->tfb_tcp_handoff_ok)(tp)) { + refcount_release(&tfb->tfb_refcnt); + tfb = NULL; + } + /* Try to use that stack. */ + if (tfb != NULL) { + /* Initialize the new stack. If it succeeds, we are done. */ + tp->t_fb = tfb; + if (tp->t_fb->tfb_tcp_fb_init == NULL || + (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) + return; + + /* + * Initialization failed. Release the reference count on + * the stack. + */ + refcount_release(&tfb->tfb_refcnt); + } + + /* + * If that wasn't feasible, use the built-in default + * stack which is not allowed to reject anyone. + */ + tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); + if (tfb == NULL) { + /* there always should be a default */ + panic("Can't refer to tcp_def_funcblk"); + } + if (tfb->tfb_tcp_handoff_ok != NULL) { + if ((*tfb->tfb_tcp_handoff_ok) (tp)) { + /* The default stack cannot say no */ + panic("Default stack rejects a new session?"); + } + } + tp->t_fb = tfb; + if (tp->t_fb->tfb_tcp_fb_init != NULL && + (*tp->t_fb->tfb_tcp_fb_init)(tp)) { + /* The default stack cannot fail */ + panic("Default stack initialization failed"); + } +} static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) @@ -433,14 +514,14 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, "list available TCP Function sets"); /* - * Exports one (struct tcp_function_id) for each non-alias. + * Exports one (struct tcp_function_info) for each alias/name. */ static int -sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) +sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { - int error, cnt; + int cnt, error; struct tcp_function *f; - struct tcp_function_id tfi; + struct tcp_function_info tfi; /* * We don't allow writes. @@ -459,20 +540,31 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) } /* - * Walk the list, comparing the name of the function entry and - * function block to determine which is an alias. - * If exporting the list, copy out matching entries. Otherwise, - * just record the total length. + * Walk the list and copy out matching entries. If INVARIANTS + * is compiled in, also walk the list to verify the length of + * the list matches what we have recorded. */ - cnt = 0; rw_rlock(&tcp_function_lock); + + cnt = 0; +#ifndef INVARIANTS + if (req->oldptr == NULL) { + cnt = tcp_fb_cnt; + goto skip_loop; + } +#endif TAILQ_FOREACH(f, &t_functions, tf_next) { - if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name, - TCP_FUNCTION_NAME_LEN_MAX)) - continue; +#ifdef INVARIANTS + cnt++; +#endif if (req->oldptr != NULL) { + tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; - (void)strncpy(tfi.tfi_name, f->tf_name, + (void)strncpy(tfi.tfi_alias, f->tf_name, + TCP_FUNCTION_NAME_LEN_MAX); + tfi.tfi_alias[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; + (void)strncpy(tfi.tfi_name, + f->tf_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); @@ -481,23 +573,110 @@ sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) * mechanism we use to accumulate length * information if the buffer was too short. */ - } else - cnt++; + } } + KASSERT(cnt == tcp_fb_cnt, + ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); +#ifndef INVARIANTS +skip_loop: +#endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, - (cnt + 1) * sizeof(struct tcp_function_id)); + (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, - NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id", + NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* + * tfb_tcp_handoff_ok() function for the default stack. + * Note that we'll basically try to take all comers. + */ +static int +tcp_default_handoff_ok(struct tcpcb *tp) +{ + + return (0); +} + +/* + * tfb_tcp_fb_init() function for the default stack. + * + * This handles making sure we have appropriate timers set if you are + * transitioning a socket that has some amount of setup done. + * + * The init() fuction from the default can *never* return non-zero i.e. + * it is required to always succeed since it is the stack of last resort! + */ +static int +tcp_default_fb_init(struct tcpcb *tp) +{ + + struct socket *so; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT, + ("%s: connection %p in unexpected state %d", __func__, tp, + tp->t_state)); + + /* + * Nothing to do for ESTABLISHED or LISTEN states. And, we don't + * know what to do for unexpected states (which includes TIME_WAIT). + */ + if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) + return (0); + + /* + * Make sure some kind of transmission timer is set if there is + * outstanding data. + */ + so = tp->t_inpcb->inp_socket; + if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || + tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || + tcp_timer_active(tp, TT_PERSIST))) { + /* + * If the session has established and it looks like it should + * be in the persist state, set the persist timer. Otherwise, + * set the retransmit timer. + */ + if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && + (int32_t)(tp->snd_nxt - tp->snd_una) < + (int32_t)sbavail(&so->so_snd)) + tcp_setpersist(tp); + else + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } + + /* All non-embryonic sessions get a keepalive timer. */ + if (!tcp_timer_active(tp, TT_KEEP)) + tcp_timer_activate(tp, TT_KEEP, + TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : + TP_KEEPINIT(tp)); + + return (0); +} + +/* + * tfb_tcp_fb_fini() function for the default stack. + * + * This changes state as necessary (or prudent) to prepare for another stack + * to assume responsibility for the connection. + */ +static void +tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + return; +} + +/* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment @@ -660,6 +839,7 @@ register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, (void)strncpy(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX); n->tf_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); @@ -676,6 +856,7 @@ cleanup: if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); + tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; @@ -721,11 +902,28 @@ register_tcp_functions(struct tcp_function_block *blk, int wait) return (register_tcp_functions_as_name(blk, NULL, wait)); } +/* + * Deregister all names associated with a function block. This + * functionally removes the function block from use within the system. + * + * When called with a true quiesce argument, mark the function block + * as being removed so no more stacks will use it and determine + * whether the removal would succeed. + * + * When called with a false quiesce argument, actually attempt the + * removal. + * + * When called with a force argument, attempt to switch all TCBs to + * use the default stack instead of returning EBUSY. + * + * Returns 0 on success (or if the removal would succeed, or an error + * code on failure. + */ int -deregister_tcp_functions(struct tcp_function_block *blk) +deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, + bool force) { struct tcp_function *f; - int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ @@ -737,21 +935,64 @@ deregister_tcp_functions(struct tcp_function_block *blk) rw_wunlock(&tcp_function_lock); return (EBUSY); } + /* Mark the block so no more stacks can use it. */ + blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; + /* + * If TCBs are still attached to the stack, attempt to switch them + * to the default stack. + */ + if (force && blk->tfb_refcnt) { + struct inpcb *inp; + struct tcpcb *tp; + VNET_ITERATOR_DECL(vnet_iter); + + rw_wunlock(&tcp_function_lock); + + VNET_LIST_RLOCK(); + /* XXX handle */ + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + INP_INFO_WLOCK(&V_tcbinfo); + LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + INP_WLOCK(inp); + if (inp->inp_flags & INP_TIMEWAIT) { + INP_WUNLOCK(inp); + continue; + } + tp = intotcpcb(inp); + if (tp == NULL || tp->t_fb != blk) { + INP_WUNLOCK(inp); + continue; + } + tcp_switch_back_to_default(tp); + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK(); + + rw_wlock(&tcp_function_lock); + } if (blk->tfb_refcnt) { - /* Still tcb attached, mark it. */ - blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; - rw_wunlock(&tcp_function_lock); + /* TCBs still attached. */ + rw_wunlock(&tcp_function_lock); return (EBUSY); } + if (quiesce) { + /* Skip removal. */ + rw_wunlock(&tcp_function_lock); + return (0); + } + /* Remove any function names that map to this function block. */ while (find_tcp_fb_locked(blk, &f) != NULL) { - /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); + tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); - error = 0; } rw_wunlock(&tcp_function_lock); - return (error); + return (0); } void @@ -1498,6 +1739,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; + /* XXX defer to epoch_call */ if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } @@ -1545,7 +1787,7 @@ tcp_discardcb(struct tcpcb *tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - int released; + int released __unused; INP_WLOCK_ASSERT(inp); @@ -1868,6 +2110,7 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; + struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; @@ -1914,7 +2157,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); + inp_list = il->il_inp_list; INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; @@ -1957,14 +2201,10 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_RLOCK(&V_tcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_RUNLOCK(&V_tcbinfo); + + il->il_count = n; + il->il_pcbinfo = &V_tcbinfo; + epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* @@ -1981,7 +2221,6 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); return (error); } diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c index 27e0e25f..e163aa54 100644 --- a/freebsd/sys/netinet/tcp_syncache.c +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -862,6 +862,12 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = rblk; + /* + * XXXrrs this is quite dangerous, it is possible + * for the new function to fail to init. We also + * are not asking if the handoff_is_ok though at + * the very start thats probalbly ok. + */ if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index 69bd052b..422e5122 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -664,8 +664,7 @@ tcp_timer_rexmt(void * xtp) tcp_inpinfo_lock_del(inp, tp); goto out; } - tp = tcp_drop(tp, tp->t_softerror ? - tp->t_softerror : ETIMEDOUT); + tp = tcp_drop(tp, ETIMEDOUT); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -696,7 +695,12 @@ tcp_timer_rexmt(void * xtp) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; - tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + if ((tp->t_flags & TF_RCVD_TSTMP) == 0) + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + /* In the event that we've negotiated timestamps + * badrxtwin will be set to the value that we set + * the retransmitted packet's to_tsval to by tcp_output + */ tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index aa26cb37..afadf7cd 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -647,7 +647,7 @@ tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { struct ucred *cred; struct inpcb *inp; - int released; + int released __unused; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index c93b2d4a..bf2cff4c 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_fastopen.h> +#include <netinet/tcp_hpts.h> #ifdef TCPPCAP #include <netinet/tcp_pcap.h> #endif @@ -1097,7 +1098,9 @@ tcp_usr_abort(struct socket *so) !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); - tcp_drop(tp, ECONNABORTED); + tp = tcp_drop(tp, ECONNABORTED); + if (tp == NULL) + goto dropped; TCPDEBUG2(PRU_ABORT); TCP_PROBE2(debug__user, tp, PRU_ABORT); } @@ -1108,6 +1111,7 @@ tcp_usr_abort(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); +dropped: INP_INFO_RUNLOCK(&V_tcbinfo); } @@ -1395,11 +1399,15 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; - if (tp->t_flags & TF_TOE) - ti->tcpi_options |= TCPI_OPT_TOE; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + ti->tcpi_options |= TCPI_OPT_TOE; + tcp_offload_tcp_info(tp, ti); + } +#endif } /* @@ -1516,22 +1524,41 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) */ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); } +#ifdef TCPHPTS + /* Assure that we are not on any hpts */ + tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL); +#endif + if (blk->tfb_tcp_fb_init) { + error = (*blk->tfb_tcp_fb_init)(tp); + if (error) { + refcount_release(&blk->tfb_refcnt); + if (tp->t_fb->tfb_tcp_fb_init) { + if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { + /* Fall back failed, drop the connection */ + INP_WUNLOCK(inp); + soabort(so); + return(error); + } + } + goto err_out; + } + } refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; - if (tp->t_fb->tfb_tcp_fb_init) { - (*tp->t_fb->tfb_tcp_fb_init)(tp); - } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif +err_out: INP_WUNLOCK(inp); return (error); } else if ((sopt->sopt_dir == SOPT_GET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { - strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name); + strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, + TCP_FUNCTION_NAME_LEN_MAX); + fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index f09bd19c..adaaff61 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -83,123 +83,123 @@ STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: - * Organized for 16 byte cacheline efficiency. + * Organized for 64 byte cacheline efficiency based + * on common tcp_input/tcp_output processing. */ struct tcpcb { - struct tsegqe_head t_segq; /* segment reassembly queue */ - int t_segqlen; /* segment reassembly queue length */ - int t_dupacks; /* consecutive dup acks recd */ - - struct tcp_timer *t_timers; /* All the TCP timers in one struct */ - + /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ - int t_state; /* state of this connection */ + struct tcp_function_block *t_fb;/* TCP function call block */ + void *t_fb_ptr; /* Pointer to t_fb specific data */ + uint32_t t_maxseg:24, /* maximum segment size */ + t_logstate:8; /* State of "black box" logging */ + uint32_t t_state:4, /* state of this connection */ + bits_spare : 24; u_int t_flags; - - struct vnet *t_vnet; /* back pointer to parent vnet */ - tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ - - tcp_seq snd_wl1; /* window update seg seq number */ - tcp_seq snd_wl2; /* window update seg ack number */ - tcp_seq iss; /* initial send sequence number */ - tcp_seq irs; /* initial receive sequence number */ - + uint32_t snd_wnd; /* send window */ + uint32_t snd_cwnd; /* congestion-controlled window */ + uint32_t cl1_spare; /* Spare to round out CL 1 */ + /* Cache line 2 */ + u_int32_t ts_offset; /* our timestamp offset */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ + int rcv_numsacks; /* # distinct sack blks present */ + u_int t_tsomax; /* TSO total burst length limit in bytes */ + u_int t_tsomaxsegcount; /* TSO maximum segment count */ + u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ + u_int t_flags2; /* More tcpcb flags storage */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + u_int32_t ts_recent; /* timestamp echo data */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char snd_limited; /* segments limited transmitted */ + u_char request_r_scale; /* pending window scaling */ + tcp_seq last_ack_sent; + u_int t_rcvtime; /* inactivity time */ + /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ - - uint32_t snd_wnd; /* send window */ - uint32_t snd_cwnd; /* congestion-controlled window */ + int t_segqlen; /* segment reassembly queue length */ + struct tsegqe_head t_segq; /* segment reassembly queue */ + struct mbuf *t_in_pkt; + struct mbuf *t_tail_pkt; + struct tcp_timer *t_timers; /* All the TCP timers in one struct */ + struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ + tcp_seq snd_wl1; /* window update seg seq number */ + /* Cache line 4 */ + tcp_seq snd_wl2; /* window update seg ack number */ + + tcp_seq irs; /* initial receive sequence number */ + tcp_seq iss; /* initial send sequence number */ + u_int t_acktime; + u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ + uint16_t cl4_spare; /* Spare to adjust CL 4 */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ + int t_rxtcur; /* current retransmit value (ticks) */ - u_int t_rcvtime; /* inactivity time */ - u_int t_starttime; /* time connection was established */ + int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ + tcp_seq t_rtseq; /* sequence number being timed */ + u_int t_starttime; /* time connection was established */ - int t_rxtcur; /* current retransmit value (ticks) */ - u_int t_maxseg; /* maximum segment size */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ - int t_srtt; /* smoothed round-trip time */ - int t_rttvar; /* variance in round-trip time */ - - int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rttmin; /* minimum rtt allowed */ + u_int t_rttbest; /* best rtt we've seen */ - u_long t_rttupdated; /* number of times rtt sampled */ - uint32_t max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ -/* out-of-band data */ - char t_oobflags; /* have some */ - char t_iobc; /* input character */ -/* RFC 1323 variables */ - u_char snd_scale; /* window scaling for send window */ - u_char rcv_scale; /* window scaling for recv window */ - u_char request_r_scale; /* pending window scaling */ - u_int32_t ts_recent; /* timestamp echo data */ - u_int ts_recent_age; /* when last updated */ - u_int32_t ts_offset; /* our timestamp offset */ - - tcp_seq last_ack_sent; -/* experimental */ + uint32_t max_sndwnd; /* largest window peer has offered */ + /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ - u_int t_badrxtwin; /* window for retransmit recovery */ - u_char snd_limited; /* segments limited transmitted */ -/* SACK related state */ + u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ + u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ - int rcv_numsacks; /* # distinct sack blks present */ - struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ - u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ - int t_bytes_acked; /* # bytes acked during current RTT */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ - + int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ - - u_int t_tsomax; /* TSO total burst length limit in bytes */ - u_int t_tsomaxsegcount; /* TSO maximum segment count */ - u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ - u_int t_flags2; /* More tcpcb flags storage */ - int t_logstate; /* State of "black box" logging */ - struct tcp_log_stailq t_logs; /* Log buffer */ + int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ - uint32_t t_logsn; /* Log "serial number" */ + struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ - struct tcp_function_block *t_fb;/* TCP function call block */ - void *t_fb_ptr; /* Pointer to t_fb specific data */ + uint32_t t_logsn; /* Log "serial number" */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { @@ -257,14 +257,19 @@ struct tcptemp { struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); + int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); + void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ - void (*tfb_tcp_fb_init)(struct tcpcb *); + int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); @@ -274,6 +279,7 @@ struct tcp_function_block { void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); + void (*tfb_tcp_mtu_chg)(struct tcpcb *); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; @@ -688,11 +694,14 @@ void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif /* - * TCP function name-to-id mapping exported to user-land via sysctl(3). + * TCP function information (name-to-id mapping, aliases, and refcnt) + * exported to user-land via sysctl(3). */ -struct tcp_function_id { +struct tcp_function_info { + uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; + char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* @@ -848,9 +857,12 @@ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); -int deregister_tcp_functions(struct tcp_function_block *blk); +int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, + bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); -struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk); +void tcp_switch_back_to_default(struct tcpcb *tp); +struct tcp_function_block * +find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); diff --git a/freebsd/sys/netinet/toecore.h b/freebsd/sys/netinet/toecore.h index 633984a6..f2374d70 100644 --- a/freebsd/sys/netinet/toecore.h +++ b/freebsd/sys/netinet/toecore.h @@ -38,6 +38,7 @@ struct tcpopt; struct tcphdr; struct in_conninfo; +struct tcp_info; struct toedev { TAILQ_ENTRY(toedev) link; /* glue for toedev_list */ @@ -101,6 +102,10 @@ struct toedev { /* TCP socket option */ void (*tod_ctloutput)(struct toedev *, struct tcpcb *, int, int); + + /* Update software state */ + void (*tod_tcp_info)(struct toedev *, struct tcpcb *, + struct tcp_info *); }; #include <sys/eventhandler.h> diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index da2dbe98..178a8d5e 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -842,6 +842,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; + struct in_pcblist *il; inp_gen_t gencnt; struct xinpgen xig; @@ -879,10 +880,8 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); - - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); + il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); + inp_list = il->il_inp_list; INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; @@ -911,14 +910,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_udbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_udbinfo); + il->il_count = n; + il->il_pcbinfo = &V_udbinfo; + epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); if (!error) { /* @@ -934,7 +928,6 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); return (error); } @@ -1578,6 +1571,7 @@ udp_abort(struct socket *so) static int udp_attach(struct socket *so, int proto, struct thread *td) { + static uint32_t udp_flowid; struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; @@ -1598,6 +1592,8 @@ udp_attach(struct socket *so, int proto, struct thread *td) inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; + inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1); + inp->inp_flowtype = M_HASHTYPE_OPAQUE; error = udp_newudpcb(inp); if (error) { @@ -1723,6 +1719,7 @@ udp_detach(struct socket *so) INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); + /* XXX defer to epoch_call */ inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); |