diff options
Diffstat (limited to 'freebsd/sys/netinet/in_pcb.c')
-rw-r--r-- | freebsd/sys/netinet/in_pcb.c | 319 |
1 files changed, 308 insertions, 11 deletions
diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index 809a7de0..e423eed8 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -18,7 +18,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_ipsec.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> +#include <rtems/bsd/local/opt_ratelimit.h> #include <rtems/bsd/local/opt_pcbgroup.h> #include <rtems/bsd/local/opt_rss.h> @@ -63,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include <sys/rmlock.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/sockio.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/refcount.h> @@ -102,11 +104,7 @@ __FBSDID("$FreeBSD$"); #include <netinet6/ip6_var.h> #endif /* INET6 */ - -#ifdef IPSEC -#include <netipsec/ipsec.h> -#include <netipsec/key.h> -#endif /* IPSEC */ +#include <netipsec/ipsec_support.h> #include <security/mac/mac_framework.h> @@ -309,8 +307,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) goto out; mac_inpcb_create(so, inp); #endif -#ifdef IPSEC - error = ipsec_init_policy(so, &inp->inp_sp); +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); @@ -336,8 +334,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ + + /* + * Routes in inpcb's can cache L2 as well; they are guaranteed + * to be cleaned up. + */ + inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); -#if defined(IPSEC) || defined(MAC) +#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); @@ -1152,6 +1156,10 @@ in_pcbdetach(struct inpcb *inp) KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); +#ifdef RATELIMIT + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); +#endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } @@ -1290,7 +1298,7 @@ in_pcbfree(struct inpcb *inp) INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ -#ifdef IPSEC +#if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif @@ -2358,7 +2366,7 @@ inp_runlock(struct inpcb *inp) INP_RUNLOCK(inp); } -#ifdef INVARIANTS +#ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { @@ -2444,6 +2452,41 @@ so_sototcpcb(struct socket *so) return (sototcpcb(so)); } +/* + * Create an external-format (``xinpcb'') structure using the information in + * the kernel-format in_pcb structure pointed to by inp. This is done to + * reduce the spew of irrelevant information over this interface, to isolate + * user code from changes in the kernel structure, and potentially to provide + * information-hiding if we decide that some of this information should be + * hidden from users. + */ +void +in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) +{ + + xi->xi_len = sizeof(struct xinpcb); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi->xi_socket); + else + bzero(&xi->xi_socket, sizeof(struct xsocket)); + bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); + xi->inp_gencnt = inp->inp_gencnt; + xi->inp_ppcb = inp->inp_ppcb; + xi->inp_flow = inp->inp_flow; + xi->inp_flowid = inp->inp_flowid; + xi->inp_flowtype = inp->inp_flowtype; + xi->inp_flags = inp->inp_flags; + xi->inp_flags2 = inp->inp_flags2; + xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; + xi->in6p_cksum = inp->in6p_cksum; + xi->in6p_hops = inp->in6p_hops; + xi->inp_ip_tos = inp->inp_ip_tos; + xi->inp_vflag = inp->inp_vflag; + xi->inp_ip_ttl = inp->inp_ip_ttl; + xi->inp_ip_p = inp->inp_ip_p; + xi->inp_ip_minttl = inp->inp_ip_minttl; +} + #ifdef DDB static void db_print_indent(int indent) @@ -2502,6 +2545,10 @@ db_print_inpflags(int inp_flags) db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } + if (inp_flags & INP_ORIGDSTADDR) { + db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); + comma = 1; + } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; @@ -2689,3 +2736,253 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb) db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ + +#ifdef RATELIMIT +/* + * Modify TX rate limit based on the existing "inp->inp_snd_tag", + * if any. + */ +int +in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) +{ + union if_snd_tag_modify_params params = { + .rate_limit.max_rate = max_pacing_rate, + }; + struct m_snd_tag *mst; + struct ifnet *ifp; + int error; + + mst = inp->inp_snd_tag; + if (mst == NULL) + return (EINVAL); + + ifp = mst->ifp; + if (ifp == NULL) + return (EINVAL); + + if (ifp->if_snd_tag_modify == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_modify(mst, ¶ms); + } + return (error); +} + +/* + * Query existing TX rate limit based on the existing + * "inp->inp_snd_tag", if any. + */ +int +in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) +{ + union if_snd_tag_query_params params = { }; + struct m_snd_tag *mst; + struct ifnet *ifp; + int error; + + mst = inp->inp_snd_tag; + if (mst == NULL) + return (EINVAL); + + ifp = mst->ifp; + if (ifp == NULL) + return (EINVAL); + + if (ifp->if_snd_tag_query == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_query(mst, ¶ms); + if (error == 0 && p_max_pacing_rate != NULL) + *p_max_pacing_rate = params.rate_limit.max_rate; + } + return (error); +} + +/* + * Allocate a new TX rate limit send tag from the network interface + * given by the "ifp" argument and save it in "inp->inp_snd_tag": + */ +int +in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, + uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) +{ + union if_snd_tag_alloc_params params = { + .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, + .rate_limit.hdr.flowid = flowid, + .rate_limit.hdr.flowtype = flowtype, + .rate_limit.max_rate = max_pacing_rate, + }; + int error; + + INP_WLOCK_ASSERT(inp); + + if (inp->inp_snd_tag != NULL) + return (EINVAL); + + if (ifp->if_snd_tag_alloc == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); + + /* + * At success increment the refcount on + * the send tag's network interface: + */ + if (error == 0) + if_ref(inp->inp_snd_tag->ifp); + } + return (error); +} + +/* + * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", + * if any: + */ +void +in_pcbdetach_txrtlmt(struct inpcb *inp) +{ + struct m_snd_tag *mst; + struct ifnet *ifp; + + INP_WLOCK_ASSERT(inp); + + mst = inp->inp_snd_tag; + inp->inp_snd_tag = NULL; + + if (mst == NULL) + return; + + ifp = mst->ifp; + if (ifp == NULL) + return; + + /* + * If the device was detached while we still had reference(s) + * on the ifp, we assume if_snd_tag_free() was replaced with + * stubs. + */ + ifp->if_snd_tag_free(mst); + + /* release reference count on network interface */ + if_rele(ifp); +} + +/* + * This function should be called when the INP_RATE_LIMIT_CHANGED flag + * is set in the fast path and will attach/detach/modify the TX rate + * limit send tag based on the socket's so_max_pacing_rate value. + */ +void +in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) +{ + struct socket *socket; + uint32_t max_pacing_rate; + bool did_upgrade; + int error; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + if (!INP_WLOCKED(inp)) { + /* + * NOTE: If the write locking fails, we need to bail + * out and use the non-ratelimited ring for the + * transmit until there is a new chance to get the + * write lock. + */ + if (!INP_TRY_UPGRADE(inp)) + return; + did_upgrade = 1; + } else { + did_upgrade = 0; + } + + /* + * NOTE: The so_max_pacing_rate value is read unlocked, + * because atomic updates are not required since the variable + * is checked at every mbuf we send. It is assumed that the + * variable read itself will be atomic. + */ + max_pacing_rate = socket->so_max_pacing_rate; + + /* + * NOTE: When attaching to a network interface a reference is + * made to ensure the network interface doesn't go away until + * all ratelimit connections are gone. The network interface + * pointers compared below represent valid network interfaces, + * except when comparing towards NULL. + */ + if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { + error = 0; + } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); + error = 0; + } else if (inp->inp_snd_tag == NULL) { + /* + * In order to utilize packet pacing with RSS, we need + * to wait until there is a valid RSS hash before we + * can proceed: + */ + if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { + error = EAGAIN; + } else { + error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), + mb->m_pkthdr.flowid, max_pacing_rate); + } + } else { + error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); + } + if (error == 0 || error == EOPNOTSUPP) + inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + if (did_upgrade) + INP_DOWNGRADE(inp); +} + +/* + * Track route changes for TX rate limiting. + */ +void +in_pcboutput_eagain(struct inpcb *inp) +{ + struct socket *socket; + bool did_upgrade; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + if (inp->inp_snd_tag == NULL) + return; + + if (!INP_WLOCKED(inp)) { + /* + * NOTE: If the write locking fails, we need to bail + * out and use the non-ratelimited ring for the + * transmit until there is a new chance to get the + * write lock. + */ + if (!INP_TRY_UPGRADE(inp)) + return; + did_upgrade = 1; + } else { + did_upgrade = 0; + } + + /* detach rate limiting */ + in_pcbdetach_txrtlmt(inp); + + /* make sure new mbuf send tag allocation is made */ + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + + if (did_upgrade) + INP_DOWNGRADE(inp); +} +#endif /* RATELIMIT */ |