diff options
author | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-08-22 14:59:50 +0200 |
---|---|---|
committer | Sebastian Huber <sebastian.huber@embedded-brains.de> | 2018-09-21 10:29:41 +0200 |
commit | 3489e3b6396ee9944a6a2e19e675ca54c36993b4 (patch) | |
tree | cd55cfac1c96ff4b888a9606fd6a0d8eb65bb446 /freebsd/sys/netinet | |
parent | ck: Define CK_MD_PPC32_LWSYNC if available (diff) | |
download | rtems-libbsd-3489e3b6396ee9944a6a2e19e675ca54c36993b4.tar.bz2 |
Update to FreeBSD head 2018-09-17
Git mirror commit 6c2192b1ef8c50788c751f878552526800b1e319.
Update #3472.
Diffstat (limited to 'freebsd/sys/netinet')
76 files changed, 4060 insertions, 1991 deletions
diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c index 4d5f8644..2450f08e 100644 --- a/freebsd/sys/netinet/cc/cc_newreno.c +++ b/freebsd/sys/netinet/cc/cc_newreno.c @@ -90,8 +90,8 @@ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); static void newreno_post_recovery(struct cc_var *ccv); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); -static VNET_DEFINE(uint32_t, newreno_beta) = 50; -static VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; +VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50; +VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80; #define V_newreno_beta VNET(newreno_beta) #define V_newreno_beta_ecn VNET(newreno_beta_ecn) @@ -129,9 +129,7 @@ newreno_malloc(struct cc_var *ccv) static void newreno_cb_destroy(struct cc_var *ccv) { - - if (ccv->cc_data != NULL) - free(ccv->cc_data, M_NEWRENO); + free(ccv->cc_data, M_NEWRENO); } static void diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c index 0d608180..6ee6b71c 100644 --- a/freebsd/sys/netinet/if_ether.c +++ b/freebsd/sys/netinet/if_ether.c @@ -96,13 +96,13 @@ static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ -static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 +VNET_DEFINE_STATIC(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ -static VNET_DEFINE(int, arp_maxtries) = 5; -static VNET_DEFINE(int, arp_proxyall) = 0; -static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for +VNET_DEFINE_STATIC(int, arp_maxtries) = 5; +VNET_DEFINE_STATIC(int, arp_proxyall) = 0; +VNET_DEFINE_STATIC(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ -static VNET_DEFINE(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ +VNET_DEFINE_STATIC(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); @@ -110,7 +110,7 @@ VNET_PCPUSTAT_SYSINIT(arpstat); VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ -static VNET_DEFINE(int, arp_maxhold) = 1; +VNET_DEFINE_STATIC(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c index a4b99f62..970a01a0 100644 --- a/freebsd/sys/netinet/igmp.c +++ b/freebsd/sys/netinet/igmp.c @@ -219,11 +219,11 @@ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ -static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general +VNET_DEFINE_STATIC(int, interface_timers_running); /* IGMPv3 general * query response */ -static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change +VNET_DEFINE_STATIC(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ -static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host +VNET_DEFINE_STATIC(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ @@ -231,25 +231,25 @@ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) -static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = +VNET_DEFINE_STATIC(LIST_HEAD(, igmp_ifsoftc), igi_head) = LIST_HEAD_INITIALIZER(igi_head); -static VNET_DEFINE(struct igmpstat, igmpstat) = { +VNET_DEFINE_STATIC(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; -static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; +VNET_DEFINE_STATIC(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) -static VNET_DEFINE(int, igmp_recvifkludge) = 1; -static VNET_DEFINE(int, igmp_sendra) = 1; -static VNET_DEFINE(int, igmp_sendlocal) = 1; -static VNET_DEFINE(int, igmp_v1enable) = 1; -static VNET_DEFINE(int, igmp_v2enable) = 1; -static VNET_DEFINE(int, igmp_legacysupp); -static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; +VNET_DEFINE_STATIC(int, igmp_recvifkludge) = 1; +VNET_DEFINE_STATIC(int, igmp_sendra) = 1; +VNET_DEFINE_STATIC(int, igmp_sendlocal) = 1; +VNET_DEFINE_STATIC(int, igmp_v1enable) = 1; +VNET_DEFINE_STATIC(int, igmp_v2enable) = 1; +VNET_DEFINE_STATIC(int, igmp_legacysupp); +VNET_DEFINE_STATIC(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c index 7233f9a2..78fd00c0 100644 --- a/freebsd/sys/netinet/in.c +++ b/freebsd/sys/netinet/in.c @@ -80,7 +80,7 @@ static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); -static VNET_DEFINE(int, nosameprefix); +VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, @@ -624,8 +624,7 @@ in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) - (*carp_detach_p)(&ia->ia_ifa, - (cmd == SIOCDIFADDR) ? false : true); + (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this @@ -1169,10 +1168,6 @@ in_lltable_free_entry(struct lltable *llt, struct llentry *lle) lltable_unlink_entry(llt, lle); } - /* cancel timer */ - if (callout_stop(&lle->lle_timer) > 0) - LLE_REMREF(lle); - /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); diff --git a/freebsd/sys/netinet/in_fib.c b/freebsd/sys/netinet/in_fib.c index f62bc4a1..f61909ea 100644 --- a/freebsd/sys/netinet/in_fib.c +++ b/freebsd/sys/netinet/in_fib.c @@ -39,7 +39,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/lock.h> -#include <sys/rwlock.h> +#include <sys/rmlock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/socket.h> @@ -136,6 +136,7 @@ int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, uint32_t flowid, struct nhop4_basic *pnh4) { + RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; @@ -184,6 +185,7 @@ int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, uint32_t flowid, struct nhop4_extended *pnh4) { + RIB_RLOCK_TRACKER; struct rib_head *rh; struct radix_node *rn; struct sockaddr_in sin; diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c index d072161f..03aaaf08 100644 --- a/freebsd/sys/netinet/in_gif.c +++ b/freebsd/sys/netinet/in_gif.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -40,18 +41,18 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <sys/param.h> -#include <sys/lock.h> -#include <sys/rmlock.h> #include <sys/systm.h> +#include <sys/jail.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/mbuf.h> #include <sys/errno.h> #include <sys/kernel.h> #include <sys/sysctl.h> -#include <sys/protosw.h> #include <sys/malloc.h> +#include <sys/proc.h> +#include <net/ethernet.h> #include <net/if.h> #include <net/if_var.h> #include <net/route.h> @@ -72,35 +73,161 @@ __FBSDID("$FreeBSD$"); #include <net/if_gif.h> -static int in_gif_input(struct mbuf **, int *, int); - -extern struct domain inetdomain; -static struct protosw in_gif_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = 0/* IPPROTO_IPV[46] */, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = in_gif_input, - .pr_output = rip_output, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - #define GIF_TTL 30 -static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; +VNET_DEFINE_STATIC(int, ip_gif_ttl) = GIF_TTL; #define V_ip_gif_ttl VNET(ip_gif_ttl) SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(ip_gif_ttl), 0, ""); + &VNET_NAME(ip_gif_ttl), 0, "Default TTL value for encapsulated packets"); + +/* + * We keep interfaces in a hash table using src+dst as key. + * Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list. + */ +VNET_DEFINE_STATIC(struct gif_list *, ipv4_hashtbl) = NULL; +VNET_DEFINE_STATIC(struct gif_list, ipv4_list) = CK_LIST_HEAD_INITIALIZER(); +#define V_ipv4_hashtbl VNET(ipv4_hashtbl) +#define V_ipv4_list VNET(ipv4_list) + +#define GIF_HASH(src, dst) (V_ipv4_hashtbl[\ + in_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)]) +#define GIF_HASH_SC(sc) GIF_HASH((sc)->gif_iphdr->ip_src.s_addr,\ + (sc)->gif_iphdr->ip_dst.s_addr) +static uint32_t +in_gif_hashval(in_addr_t src, in_addr_t dst) +{ + uint32_t ret; + + ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); + return (fnv_32_buf(&dst, sizeof(dst), ret)); +} + +static int +in_gif_checkdup(const struct gif_softc *sc, in_addr_t src, in_addr_t dst) +{ + struct gif_softc *tmp; + + if (sc->gif_family == AF_INET && + sc->gif_iphdr->ip_src.s_addr == src && + sc->gif_iphdr->ip_dst.s_addr == dst) + return (EEXIST); + + CK_LIST_FOREACH(tmp, &GIF_HASH(src, dst), chain) { + if (tmp == sc) + continue; + if (tmp->gif_iphdr->ip_src.s_addr == src && + tmp->gif_iphdr->ip_dst.s_addr == dst) + return (EADDRNOTAVAIL); + } + return (0); +} + +static void +in_gif_attach(struct gif_softc *sc) +{ + + if (sc->gif_options & GIF_IGNORE_SOURCE) + CK_LIST_INSERT_HEAD(&V_ipv4_list, sc, chain); + else + CK_LIST_INSERT_HEAD(&GIF_HASH_SC(sc), sc, chain); +} + +int +in_gif_setopts(struct gif_softc *sc, u_int options) +{ + + /* NOTE: we are protected with gif_ioctl_sx lock */ + MPASS(sc->gif_family == AF_INET); + MPASS(sc->gif_options != options); + + if ((options & GIF_IGNORE_SOURCE) != + (sc->gif_options & GIF_IGNORE_SOURCE)) { + CK_LIST_REMOVE(sc, chain); + sc->gif_options = options; + in_gif_attach(sc); + } + return (0); +} + +int +in_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct sockaddr_in *dst, *src; + struct ip *ip; + int error; + + /* NOTE: we are protected with gif_ioctl_sx lock */ + error = EINVAL; + switch (cmd) { + case SIOCSIFPHYADDR: + src = &((struct in_aliasreq *)data)->ifra_addr; + dst = &((struct in_aliasreq *)data)->ifra_dstaddr; + + /* sanity checks */ + if (src->sin_family != dst->sin_family || + src->sin_family != AF_INET || + src->sin_len != dst->sin_len || + src->sin_len != sizeof(*src)) + break; + if (src->sin_addr.s_addr == INADDR_ANY || + dst->sin_addr.s_addr == INADDR_ANY) { + error = EADDRNOTAVAIL; + break; + } + if (V_ipv4_hashtbl == NULL) + V_ipv4_hashtbl = gif_hashinit(); + error = in_gif_checkdup(sc, src->sin_addr.s_addr, + dst->sin_addr.s_addr); + if (error == EADDRNOTAVAIL) + break; + if (error == EEXIST) { + /* Addresses are the same. Just return. */ + error = 0; + break; + } + ip = malloc(sizeof(*ip), M_GIF, M_WAITOK | M_ZERO); + ip->ip_src.s_addr = src->sin_addr.s_addr; + ip->ip_dst.s_addr = dst->sin_addr.s_addr; + if (sc->gif_family != 0) { + /* Detach existing tunnel first */ + CK_LIST_REMOVE(sc, chain); + GIF_WAIT(); + free(sc->gif_hdr, M_GIF); + /* XXX: should we notify about link state change? */ + } + sc->gif_family = AF_INET; + sc->gif_iphdr = ip; + in_gif_attach(sc); + break; + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + if (sc->gif_family != AF_INET) { + error = EADDRNOTAVAIL; + break; + } + src = (struct sockaddr_in *)&ifr->ifr_addr; + memset(src, 0, sizeof(*src)); + src->sin_family = AF_INET; + src->sin_len = sizeof(*src); + src->sin_addr = (cmd == SIOCGIFPSRCADDR) ? + sc->gif_iphdr->ip_src: sc->gif_iphdr->ip_dst; + error = prison_if(curthread->td_ucred, (struct sockaddr *)src); + if (error != 0) + memset(src, 0, sizeof(*src)); + break; + } + return (error); +} int in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { - GIF_RLOCK_TRACKER; struct gif_softc *sc = ifp->if_softc; struct ip *ip; int len; /* prepend new IP header */ + MPASS(in_epoch(net_epoch_preempt)); len = sizeof(struct ip); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) @@ -119,15 +246,9 @@ in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) } #endif ip = mtod(m, struct ip *); - GIF_RLOCK(sc); - if (sc->gif_family != AF_INET) { - m_freem(m); - GIF_RUNLOCK(sc); - return (ENETDOWN); - } - bcopy(sc->gif_iphdr, ip, sizeof(struct ip)); - GIF_RUNLOCK(sc); + MPASS(sc->gif_family == AF_INET); + bcopy(sc->gif_iphdr, ip, sizeof(struct ip)); ip->ip_p = proto; /* version will be set in ip_output() */ ip->ip_ttl = V_ip_gif_ttl; @@ -138,15 +259,14 @@ in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) } static int -in_gif_input(struct mbuf **mp, int *offp, int proto) +in_gif_input(struct mbuf *m, int off, int proto, void *arg) { - struct mbuf *m = *mp; - struct gif_softc *sc; + struct gif_softc *sc = arg; struct ifnet *gifp; struct ip *ip; uint8_t ecn; - sc = encap_getarg(m); + MPASS(in_epoch(net_epoch_preempt)); if (sc == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); @@ -156,7 +276,7 @@ in_gif_input(struct mbuf **mp, int *offp, int proto) if ((gifp->if_flags & IFF_UP) != 0) { ip = mtod(m, struct ip *); ecn = ip->ip_tos; - m_adj(m, *offp); + m_adj(m, off); gif_input(m, gifp, proto, ecn); } else { m_freem(m); @@ -165,56 +285,125 @@ in_gif_input(struct mbuf **mp, int *offp, int proto) return (IPPROTO_DONE); } -/* - * we know that we are in IFF_UP, outer address available, and outer family - * matched the physical addr family. see gif_encapcheck(). - */ -int -in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +static int +in_gif_lookup(const struct mbuf *m, int off, int proto, void **arg) { const struct ip *ip; struct gif_softc *sc; int ret; - /* sanity check done in caller */ - sc = (struct gif_softc *)arg; - GIF_RLOCK_ASSERT(sc); + if (V_ipv4_hashtbl == NULL) + return (0); - /* check for address match */ + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); - if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr) + /* + * NOTE: it is safe to iterate without any locking here, because softc + * can be reclaimed only when we are not within net_epoch_preempt + * section, but ip_encap lookup+input are executed in epoch section. + */ + ret = 0; + CK_LIST_FOREACH(sc, &GIF_HASH(ip->ip_dst.s_addr, + ip->ip_src.s_addr), chain) { + /* + * This is an inbound packet, its ip_dst is source address + * in softc. + */ + if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr && + sc->gif_iphdr->ip_dst.s_addr == ip->ip_src.s_addr) { + ret = ENCAP_DRV_LOOKUP; + goto done; + } + } + /* + * No exact match. + * Check the list of interfaces with GIF_IGNORE_SOURCE flag. + */ + CK_LIST_FOREACH(sc, &V_ipv4_list, chain) { + if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr) { + ret = 32 + 8; /* src + proto */ + goto done; + } + } + return (0); +done: + if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); - ret = 32; - if (sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) { - if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0) - return (0); - } else - ret += 32; - /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { struct nhop4_basic nh4; struct in_addr dst; dst = ip->ip_src; - if (fib4_lookup_nh_basic(sc->gif_fibnum, dst, 0, 0, &nh4) != 0) return (0); - if (nh4.nh_ifp != m->m_pkthdr.rcvif) return (0); } + *arg = sc; return (ret); } -int -in_gif_attach(struct gif_softc *sc) +static struct { + const struct encap_config encap; + const struct encaptab *cookie; +} ipv4_encap_cfg[] = { + { + .encap = { + .proto = IPPROTO_IPV4, + .min_length = 2 * sizeof(struct ip), + .exact_match = ENCAP_DRV_LOOKUP, + .lookup = in_gif_lookup, + .input = in_gif_input + }, + }, +#ifdef INET6 + { + .encap = { + .proto = IPPROTO_IPV6, + .min_length = sizeof(struct ip) + + sizeof(struct ip6_hdr), + .exact_match = ENCAP_DRV_LOOKUP, + .lookup = in_gif_lookup, + .input = in_gif_input + }, + }, +#endif + { + .encap = { + .proto = IPPROTO_ETHERIP, + .min_length = sizeof(struct ip) + + sizeof(struct etherip_header) + + sizeof(struct ether_header), + .exact_match = ENCAP_DRV_LOOKUP, + .lookup = in_gif_lookup, + .input = in_gif_input + }, + } +}; + +void +in_gif_init(void) { + int i; - KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); - sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck, - &in_gif_protosw, sc); - if (sc->gif_ecookie == NULL) - return (EEXIST); - return (0); + if (!IS_DEFAULT_VNET(curvnet)) + return; + for (i = 0; i < nitems(ipv4_encap_cfg); i++) + ipv4_encap_cfg[i].cookie = ip_encap_attach( + &ipv4_encap_cfg[i].encap, NULL, M_WAITOK); +} + +void +in_gif_uninit(void) +{ + int i; + + if (IS_DEFAULT_VNET(curvnet)) { + for (i = 0; i < nitems(ipv4_encap_cfg); i++) + ip_encap_detach(ipv4_encap_cfg[i].cookie); + } + if (V_ipv4_hashtbl != NULL) + gif_hashdestroy(V_ipv4_hashtbl); } + diff --git a/freebsd/sys/netinet/in_gif.h b/freebsd/sys/netinet/in_gif.h deleted file mode 100644 index e1f4ae48..00000000 --- a/freebsd/sys/netinet/in_gif.h +++ /dev/null @@ -1,45 +0,0 @@ -/* $FreeBSD$ */ -/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */ - -/*- - * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _NETINET_IN_GIF_H_ -#define _NETINET_IN_GIF_H_ - -#define GIF_TTL 30 - -struct gif_softc; -void in_gif_input(struct mbuf *, int); -int in_gif_output(struct ifnet *, int, struct mbuf *); -int gif_encapcheck4(const struct mbuf *, int, int, void *); -int in_gif_attach(struct gif_softc *); -int in_gif_detach(struct gif_softc *); - -#endif /*_NETINET_IN_GIF_H_*/ diff --git a/freebsd/sys/netinet/in_kdtrace.h b/freebsd/sys/netinet/in_kdtrace.h index ba63a9a9..ccf53833 100644 --- a/freebsd/sys/netinet/in_kdtrace.h +++ b/freebsd/sys/netinet/in_kdtrace.h @@ -34,6 +34,8 @@ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) #define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) +#define UDPLITE_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ + SDT_PROBE5(udplite, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE1(probe, arg0) \ SDT_PROBE1(tcp, , , probe, arg0) #define TCP_PROBE2(probe, arg0, arg1) \ @@ -46,14 +48,32 @@ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) +#define SCTP_PROBE1(probe, arg0) \ + SDT_PROBE1(sctp, , , probe, arg0) +#define SCTP_PROBE2(probe, arg0, arg1) \ + SDT_PROBE2(sctp, , , probe, arg0, arg1) +#define SCTP_PROBE3(probe, arg0, arg1, arg2) \ + SDT_PROBE3(sctp, , , probe, arg0, arg1, arg2) +#define SCTP_PROBE4(probe, arg0, arg1, arg2, arg3) \ + SDT_PROBE4(sctp, , , probe, arg0, arg1, arg2, arg3) +#define SCTP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ + SDT_PROBE5(sctp, , , probe, arg0, arg1, arg2, arg3, arg4) +#define SCTP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ + SDT_PROBE6(sctp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) SDT_PROVIDER_DECLARE(ip); +SDT_PROVIDER_DECLARE(sctp); SDT_PROVIDER_DECLARE(tcp); SDT_PROVIDER_DECLARE(udp); +SDT_PROVIDER_DECLARE(udplite); SDT_PROBE_DECLARE(ip, , , receive); SDT_PROBE_DECLARE(ip, , , send); +SDT_PROBE_DECLARE(sctp, , , receive); +SDT_PROBE_DECLARE(sctp, , , send); +SDT_PROBE_DECLARE(sctp, , , state__change); + SDT_PROBE_DECLARE(tcp, , , accept__established); SDT_PROBE_DECLARE(tcp, , , accept__refused); SDT_PROBE_DECLARE(tcp, , , connect__established); @@ -72,4 +92,7 @@ SDT_PROBE_DECLARE(tcp, , , receive__autoresize); SDT_PROBE_DECLARE(udp, , , receive); SDT_PROBE_DECLARE(udp, , , send); +SDT_PROBE_DECLARE(udplite, , , receive); +SDT_PROBE_DECLARE(udplite, , , send); + #endif diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index ea4779fc..e0fd4c37 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -233,8 +233,13 @@ static void inm_init(void) taskqgroup_config_gtask_init(NULL, &free_gtask, inm_release_task, "inm release task"); } +#ifdef EARLY_AP_STARTUP SYSINIT(inm_init, SI_SUB_SMP + 1, SI_ORDER_FIRST, inm_init, NULL); +#else +SYSINIT(inm_init, SI_SUB_ROOT_CONF - 1, SI_ORDER_FIRST, + inm_init, NULL); +#endif void @@ -260,7 +265,10 @@ inm_disconnect(struct in_multi *inm) ifma = inm->inm_ifma; if_ref(ifp); - CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); + if (ifma->ifma_flags & IFMA_F_ENQUEUED) { + CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); + ifma->ifma_flags &= ~IFMA_F_ENQUEUED; + } MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname); if ((ll_ifma = ifma->ifma_llifma) != NULL) { MPASS(ifma != ll_ifma); @@ -268,7 +276,10 @@ inm_disconnect(struct in_multi *inm) MPASS(ll_ifma->ifma_llifma == NULL); MPASS(ll_ifma->ifma_ifp == ifp); if (--ll_ifma->ifma_refcount == 0) { - CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); + if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { + CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); + ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; + } MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname); if_freemulti(ll_ifma); ifma_restart = true; @@ -1581,23 +1592,24 @@ inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) * Begin state merge transaction at IGMP layer. */ IN_MULTI_LOCK(); - IN_MULTI_LIST_LOCK(); CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); error = igmp_change_state(inm); + IN_MULTI_LIST_UNLOCK(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); out_in_multi_locked: IN_MULTI_UNLOCK(); - IN_MULTI_UNLOCK(); out_imf_rollback: if (error) imf_rollback(imf); @@ -1664,16 +1676,13 @@ inp_findmoptions(struct inpcb *inp) } static void -inp_gcmoptions(epoch_context_t ctx) +inp_gcmoptions(struct ip_moptions *imo) { - struct ip_moptions *imo; struct in_mfilter *imf; struct in_multi *inm; struct ifnet *ifp; size_t idx, nmships; - imo = __containerof(ctx, struct ip_moptions, imo_epoch_ctx); - nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; @@ -1709,7 +1718,7 @@ inp_freemoptions(struct ip_moptions *imo) { if (imo == NULL) return; - epoch_call(net_epoch_preempt, &imo->imo_epoch_ctx, inp_gcmoptions); + inp_gcmoptions(imo); } /* @@ -2261,7 +2270,8 @@ inp_join_group(struct inpcb *inp, struct sockopt *sopt) __func__); IN_MULTI_LIST_UNLOCK(); goto out_imo_free; - } + } + inm_acquire(inm); imo->imo_membership[idx] = inm; } else { CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); @@ -2301,6 +2311,12 @@ out_in_multi_locked: out_imo_free: if (error && is_new) { + inm = imo->imo_membership[idx]; + if (inm != NULL) { + IN_MULTI_LIST_LOCK(); + inm_release_deferred(inm); + IN_MULTI_LIST_UNLOCK(); + } imo->imo_membership[idx] = NULL; --imo->imo_num_memberships; } @@ -2494,6 +2510,7 @@ inp_leave_group(struct inpcb *inp, struct sockopt *sopt) if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + IN_MULTI_LIST_UNLOCK(); goto out_in_multi_locked; } @@ -2738,12 +2755,12 @@ inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) INP_WLOCK_ASSERT(inp); IN_MULTI_LOCK(); - IN_MULTI_LIST_LOCK(); /* * Begin state merge transaction at IGMP layer. */ CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + IN_MULTI_LIST_LOCK(); error = inm_merge(inm, imf); if (error) { CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index f89487b6..5ba918fa 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -114,6 +114,9 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> +#define INPCBLBGROUP_SIZMIN 8 +#define INPCBLBGROUP_SIZMAX 256 + static struct callout ipport_tick_callout; /* @@ -141,7 +144,7 @@ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); -static VNET_DEFINE(int, ipport_tcplastcount); +VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) @@ -223,6 +226,222 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, * functions often modify hash chains or addresses in pcbs. */ +static struct inpcblbgroup * +in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, + uint16_t port, const union in_dependaddr *addr, int size) +{ + struct inpcblbgroup *grp; + size_t bytes; + + bytes = __offsetof(struct inpcblbgroup, il_inp[size]); + grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); + if (!grp) + return (NULL); + grp->il_vflag = vflag; + grp->il_lport = port; + grp->il_dependladdr = *addr; + grp->il_inpsiz = size; + CK_LIST_INSERT_HEAD(hdr, grp, il_list); + return (grp); +} + +static void +in_pcblbgroup_free_deferred(epoch_context_t ctx) +{ + struct inpcblbgroup *grp; + + grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); + free(grp, M_PCB); +} + +static void +in_pcblbgroup_free(struct inpcblbgroup *grp) +{ + + CK_LIST_REMOVE(grp, il_list); + epoch_call(net_epoch_preempt, &grp->il_epoch_ctx, + in_pcblbgroup_free_deferred); +} + +static struct inpcblbgroup * +in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, + struct inpcblbgroup *old_grp, int size) +{ + struct inpcblbgroup *grp; + int i; + + grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, + old_grp->il_lport, &old_grp->il_dependladdr, size); + if (!grp) + return (NULL); + + KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, + ("invalid new local group size %d and old local group count %d", + grp->il_inpsiz, old_grp->il_inpcnt)); + + for (i = 0; i < old_grp->il_inpcnt; ++i) + grp->il_inp[i] = old_grp->il_inp[i]; + grp->il_inpcnt = old_grp->il_inpcnt; + in_pcblbgroup_free(old_grp); + return (grp); +} + +/* + * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] + * and shrink group if possible. + */ +static void +in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, + int i) +{ + struct inpcblbgroup *grp = *grpp; + + for (; i + 1 < grp->il_inpcnt; ++i) + grp->il_inp[i] = grp->il_inp[i + 1]; + grp->il_inpcnt--; + + if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && + grp->il_inpcnt <= (grp->il_inpsiz / 4)) { + /* Shrink this group. */ + struct inpcblbgroup *new_grp = + in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); + if (new_grp) + *grpp = new_grp; + } + return; +} + +/* + * Add PCB to load balance group for SO_REUSEPORT_LB option. + */ +static int +in_pcbinslbgrouphash(struct inpcb *inp) +{ + const static struct timeval interval = { 60, 0 }; + static struct timeval lastprint; + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + uint16_t hashmask, lport; + uint32_t group_index; + struct ucred *cred; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return (0); + + hashmask = pcbinfo->ipi_lbgrouphashmask; + lport = inp->inp_lport; + group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask); + hdr = &pcbinfo->ipi_lbgrouphashbase[group_index]; + + /* + * Don't allow jailed socket to join local group. + */ + if (inp->inp_socket != NULL) + cred = inp->inp_socket->so_cred; + else + cred = NULL; + if (cred != NULL && jailed(cred)) + return (0); + +#ifdef INET6 + /* + * Don't allow IPv4 mapped INET6 wild socket. + */ + if ((inp->inp_vflag & INP_IPV4) && + inp->inp_laddr.s_addr == INADDR_ANY && + INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { + return (0); + } +#endif + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + CK_LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL) { + /* Create new load balance group. */ + grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, + inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, + INPCBLBGROUP_SIZMIN); + if (!grp) + return (ENOBUFS); + } else if (grp->il_inpcnt == grp->il_inpsiz) { + if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { + if (ratecheck(&lastprint, &interval)) + printf("lb group port %d, limit reached\n", + ntohs(grp->il_lport)); + return (0); + } + + /* Expand this local group. */ + grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); + if (!grp) + return (ENOBUFS); + } + + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", + grp->il_inpsiz, grp->il_inpcnt)); + + grp->il_inp[grp->il_inpcnt] = inp; + grp->il_inpcnt++; + return (0); +} + +/* + * Remove PCB from load balance group. + */ +static void +in_pcbremlbgrouphash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + int i; + + pcbinfo = inp->inp_pcbinfo; + + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(pcbinfo); + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, + pcbinfo->ipi_lbgrouphashmask)]; + + CK_LIST_FOREACH(grp, hdr, il_list) { + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_inpcnt == 1) { + /* We are the last, free this local group. */ + in_pcblbgroup_free(grp); + } else { + /* Pull up inpcbs, shrink group if possible. */ + in_pcblbgroup_reorder(hdr, &grp, i); + } + return; + } + } +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -252,12 +471,14 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; - LIST_INIT(pcbinfo->ipi_listhead); + CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif @@ -281,6 +502,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, + pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif @@ -341,7 +564,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); - LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); + CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 @@ -519,18 +742,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, /* * Return cached socket options. */ -short +int inp_so_options(const struct inpcb *inp) { - short so_options; + int so_options; - so_options = 0; + so_options = 0; - if ((inp->inp_flags2 & INP_REUSEPORT) != 0) - so_options |= SO_REUSEPORT; - if ((inp->inp_flags2 & INP_REUSEADDR) != 0) - so_options |= SO_REUSEADDR; - return (so_options); + if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) + so_options |= SO_REUSEPORT_LB; + if ((inp->inp_flags2 & INP_REUSEPORT) != 0) + so_options |= SO_REUSEPORT; + if ((inp->inp_flags2 & INP_REUSEADDR) != 0) + so_options |= SO_REUSEADDR; + return (so_options); } #endif /* INET || INET6 */ @@ -589,6 +814,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, int error; /* + * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here + * so that we don't have to add to the (already messy) code below. + */ + int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + + /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); @@ -599,7 +830,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) @@ -636,16 +867,23 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; + /* + * XXX: How to deal with SO_REUSEPORT_LB here? + * Treat same as SO_REUSEPORT for now. + */ + if ((so->so_options & + (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* - * Is the address a local IP address? + * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && - ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) + ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; @@ -675,7 +913,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_flags2 & INP_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) || + (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && #ifndef __rtems__ (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) @@ -704,11 +943,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, */ tw = intotw(t); if (tw == NULL || - (reuseport & tw->tw_so_options) == 0) + ((reuseport & tw->tw_so_options) == 0 && + (reuseport_lb & + tw->tw_so_options) == 0)) { return (EADDRINUSE); + } } else if (t && - ((inp->inp_flags2 & INP_BINDMULTI) == 0) && - (reuseport & inp_so_options(t)) == 0) { + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -717,7 +960,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif - return (EADDRINUSE); + return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } @@ -862,7 +1105,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; @@ -876,10 +1118,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); @@ -921,7 +1161,6 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, */ ia = NULL; ifp = sro.ro_rt->rt_ifp; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) @@ -934,10 +1173,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); @@ -985,9 +1222,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, ifp = ia->ia_ifp; ia = NULL; - IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; @@ -1000,10 +1235,8 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; - IF_ADDR_RUNLOCK(ifp); goto done; } - IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ @@ -1347,6 +1580,58 @@ in_pcblist_rele_rlocked(epoch_context_t ctx) free(il, M_TEMP); } +static void +inpcbport_free(epoch_context_t ctx) +{ + struct inpcbport *phd; + + phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); + free(phd, M_PCB); +} + +static void +in_pcbfree_deferred(epoch_context_t ctx) +{ + struct inpcb *inp; + int released __unused; + + inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); + + INP_WLOCK(inp); +#ifdef INET + struct ip_moptions *imo = inp->inp_moptions; + inp->inp_moptions = NULL; +#endif + /* XXXRW: Do as much as possible here. */ +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif +#ifdef INET6 + struct ip6_moptions *im6o = NULL; + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); + inp->inp_vflag = 0; + crfree(inp->inp_cred); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif + released = in_pcbrele_wlocked(inp); + MPASS(released); +#ifdef INET6 + ip6_freemoptions(im6o); +#endif +#ifdef INET + inp_freemoptions(imo); +#endif +} + /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached @@ -1361,14 +1646,7 @@ in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; -#ifdef INET6 - struct ip6_moptions *im6o = NULL; -#endif -#ifdef INET - struct ip_moptions *imo = NULL; -#endif KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { @@ -1384,45 +1662,14 @@ in_pcbfree(struct inpcb *inp) } #endif INP_WLOCK_ASSERT(inp); - -#ifdef INET - imo = inp->inp_moptions; - inp->inp_moptions = NULL; -#endif - /* XXXRW: Do as much as possible here. */ -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif INP_LIST_WLOCK(pcbinfo); - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); -#ifdef INET6 - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - im6o = inp->in6p_moptions; - inp->in6p_moptions = NULL; - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); RO_INVALIDATE_CACHE(&inp->inp_route); - - inp->inp_vflag = 0; + /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; - crfree(inp->inp_cred); -#ifdef MAC - mac_inpcb_destroy(inp); -#endif -#ifdef INET6 - ip6_freemoptions(im6o); -#endif -#ifdef INET - inp_freemoptions(imo); -#endif - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); + INP_WUNLOCK(inp); + epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred); } /* @@ -1444,6 +1691,10 @@ in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); +#ifdef INVARIANTS + if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) + MPASS(inp->inp_refcount > 1); +#endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with @@ -1454,11 +1705,12 @@ in_pcbdrop(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); - LIST_REMOVE(inp, inp_hash); - LIST_REMOVE(inp, inp_portlist); - if (LIST_FIRST(&phd->phd_pcblist) == NULL) { - LIST_REMOVE(phd, phd_hash); - free(phd, M_PCB); + in_pcbremlbgrouphash(inp); + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; @@ -1532,7 +1784,7 @@ in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); - LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { + CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { @@ -1559,7 +1811,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) int i, gap; INP_INFO_WLOCK(pcbinfo); - LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && @@ -1624,7 +1876,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1658,7 +1910,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; - LIST_FOREACH(phd, porthash, phd_hash) { + CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } @@ -1667,7 +1919,7 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, * Port is in use by one or more PCBs. Look for best * fit. */ - LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { + CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, @@ -1717,6 +1969,50 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } #undef INP_LOOKUP_MAPPED_PCB_COST +static struct inpcb * +in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb *local_wild; + const struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + uint32_t idx; + + INP_HASH_LOCK_ASSERT(pcbinfo); + + hdr = &pcbinfo->ipi_lbgrouphashbase[INP_PCBLBGROUP_PORTHASH(lport, + pcbinfo->ipi_lbgrouphashmask)]; + + /* + * Order of socket selection: + * 1. non-wild. + * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). + * + * NOTE: + * - Load balanced group does not contain jailed sockets + * - Load balanced group does not contain IPv4 mapped INET6 wild sockets + */ + local_wild = NULL; + CK_LIST_FOREACH(grp, hdr, il_list) { +#ifdef INET6 + if (!(grp->il_vflag & INP_IPV4)) + continue; +#endif + if (grp->il_lport != lport) + continue; + + idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % + grp->il_inpcnt; + if (grp->il_laddr.s_addr == laddr->s_addr) + return (grp->il_inp[idx]); + if (grp->il_laddr.s_addr == INADDR_ANY && + (lookupflags & INPLOOKUP_WILDCARD) != 0) + local_wild = grp->il_inp[idx]; + } + return (local_wild); +} + #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. @@ -1738,7 +2034,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; - LIST_FOREACH(inp, head, inp_pcbgrouphash) { + CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1788,7 +2084,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; - LIST_FOREACH(inp, head, inp_pcbgrouphash) { + CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1862,7 +2158,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; - LIST_FOREACH(inp, head, inp_pcbgroup_wild) { + CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1922,7 +2218,13 @@ found: locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); - if (!locked) + if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { + if (lookupflags & INPLOOKUP_WLOCKPCB) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + return (NULL); + } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { @@ -1960,18 +2262,19 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; +#ifdef INVARIANTS KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - - INP_HASH_LOCK_ASSERT(pcbinfo); - + if (!mtx_owned(&pcbinfo->ipi_hash_lock)) + MPASS(in_epoch_verbose(net_epoch_preempt, 1)); +#endif /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -1996,6 +2299,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, return (tmpinp); /* + * Then look in lb group (for wildcard match). + */ + if (pcbinfo->ipi_lbgrouphashbase != NULL && + (lookupflags & INPLOOKUP_WILDCARD)) { + inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, + fport, lookupflags); + if (inp != NULL) { + return (inp); + } + } + + /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { @@ -2016,7 +2331,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; - LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -2080,40 +2395,35 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, struct ifnet *ifp) { struct inpcb *inp; - bool locked; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) - locked = INP_TRY_WLOCK(inp); - else if (lookupflags & INPLOOKUP_RLOCKPCB) - locked = INP_TRY_RLOCK(inp); - else - panic("%s: locking bug", __func__); - if (!locked) - in_pcbref(inp); - INP_HASH_RUNLOCK(pcbinfo); - if (!locked) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - return (NULL); - } else { - INP_RLOCK(inp); - if (in_pcbrele_rlocked(inp)) - return (NULL); + if (lookupflags & INPLOOKUP_WLOCKPCB) { + INP_WLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_WUNLOCK(inp); + inp = NULL; } - } + } else if (lookupflags & INPLOOKUP_RLOCKPCB) { + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + INP_RUNLOCK(inp); + inp = NULL; + } + } else + panic("%s: locking bug", __func__); #ifdef INVARIANTS - if (lookupflags & INPLOOKUP_WLOCKPCB) - INP_WLOCK_ASSERT(inp); - else - INP_RLOCK_ASSERT(inp); + if (inp != NULL) { + if (lookupflags & INPLOOKUP_WLOCKPCB) + INP_WLOCK_ASSERT(inp); + else + INP_RLOCK_ASSERT(inp); + } #endif - } else - INP_HASH_RUNLOCK(pcbinfo); + } + INP_HASH_RUNLOCK(pcbinfo); return (inp); } @@ -2212,6 +2522,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; + int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); @@ -2233,9 +2544,22 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* + * Add entry to load balance group. + * Only do this if SO_REUSEPORT_LB is set. + */ + so_options = inp_so_options(inp); + if (so_options & SO_REUSEPORT_LB) { + int ret = in_pcbinslbgrouphash(inp); + if (ret) { + /* pcb lb group malloc fail (ret=ENOBUFS). */ + return (ret); + } + } + + /* * Go through port list and look for a head for this lport. */ - LIST_FOREACH(phd, pcbporthash, phd_hash) { + CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } @@ -2247,13 +2571,14 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) if (phd == NULL) { return (ENOBUFS); /* XXX */ } + bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; - LIST_INIT(&phd->phd_pcblist); - LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + CK_LIST_INIT(&phd->phd_pcblist); + CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; - LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); - LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) @@ -2316,8 +2641,8 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; - LIST_REMOVE(inp, inp_hash); - LIST_INSERT_HEAD(head, inp, inp_hash); + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) @@ -2358,16 +2683,20 @@ in_pcbremlists(struct inpcb *inp) struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); - LIST_REMOVE(inp, inp_hash); - LIST_REMOVE(inp, inp_portlist); - if (LIST_FIRST(&phd->phd_pcblist) == NULL) { - LIST_REMOVE(phd, phd_hash); - free(phd, M_PCB); + + /* XXX: Only do if SO_REUSEPORT_LB set? */ + in_pcbremlbgrouphash(inp); + + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } - LIST_REMOVE(inp, inp_list); + CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); @@ -2511,7 +2840,7 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); - LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); @@ -2594,7 +2923,7 @@ in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) bzero(&xi->xi_socket, sizeof(struct xsocket)); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; - xi->inp_ppcb = inp->inp_ppcb; + xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h index d00dd456..86c9705c 100644 --- a/freebsd/sys/netinet/in_pcb.h +++ b/freebsd/sys/netinet/in_pcb.h @@ -51,8 +51,11 @@ #include <sys/lock.h> #include <sys/rwlock.h> #include <net/vnet.h> +#include <net/if.h> +#include <net/if_var.h> #include <vm/uma.h> #endif +#include <sys/ck.h> #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ @@ -65,8 +68,9 @@ * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ -LIST_HEAD(inpcbhead, inpcb); -LIST_HEAD(inpcbporthead, inpcbport); +CK_LIST_HEAD(inpcbhead, inpcb); +CK_LIST_HEAD(inpcbporthead, inpcbport); +CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup); typedef uint64_t inp_gen_t; /* @@ -79,6 +83,11 @@ struct in_addr_4in6 { struct in_addr ia46_addr4; }; +union in_dependaddr { + struct in_addr_4in6 id46_addr; + struct in6_addr id6_addr; +}; + /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. @@ -89,22 +98,14 @@ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ - union { - /* foreign host table entry */ - struct in_addr_4in6 ie46_foreign; - struct in6_addr ie6_foreign; - } ie_dependfaddr; - union { - /* local host table entry */ - struct in_addr_4in6 ie46_local; - struct in6_addr ie6_local; - } ie_dependladdr; + union in_dependaddr ie_dependfaddr; /* foreign host table entry */ + union in_dependaddr ie_dependladdr; /* local host table entry */ +#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4 +#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4 +#define ie6_faddr ie_dependfaddr.id6_addr +#define ie6_laddr ie_dependladdr.id6_addr u_int32_t ie6_zoneid; /* scope zone id */ }; -#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 -#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 -#define ie6_faddr ie_dependfaddr.ie6_foreign -#define ie6_laddr ie_dependladdr.ie6_local /* * XXX The defines for inc_* are hacks and should be changed to direct @@ -122,8 +123,8 @@ struct in_conninfo { * Flags for inc_flags. */ #define INC_ISIPV6 0x01 +#define INC_IPV6MINMTU 0x02 -#define inc_isipv6 inc_flags /* temp compatibility */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr @@ -159,6 +160,7 @@ struct in_conninfo { * Key: * (b) - Protected by the hpts lock. * (c) - Constant after initialization + * (e) - Protected by the net_epoch_prempt epoch * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb @@ -233,8 +235,8 @@ struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ - LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -278,7 +280,7 @@ struct inpcb { TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ + CK_LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ @@ -316,18 +318,19 @@ struct inpcb { int in6p_cksum; short in6p_hops; }; - LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ + CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ struct inpcbport *inp_phd; /* (i/h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ - struct llentry *inp_lle; /* cached L2 information */ + void *spare_ptr; /* Spare pointer. */ rt_gen_t inp_rt_cookie; /* generation for route entry */ union { /* cached L3 information */ struct route inp_route; struct route_in6 inp_route6; }; - LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ - /* (p[w]) for list iteration */ - /* (p[r]/l) for addition/removal */ + CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ + /* (e[r]) for list iteration */ + /* (p[w]/l) for addition/removal */ + struct epoch_context inp_epoch_ctx; }; #endif /* _KERNEL */ @@ -364,14 +367,11 @@ struct inpcb { */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { - size_t xi_len; /* length of this structure */ + ksize_t xi_len; /* length of this structure */ struct xsocket xi_socket; /* (s,p) */ struct in_conninfo inp_inc; /* (s,p) */ uint64_t inp_gencnt; /* (s,p) */ - union { - void *inp_ppcb; /* (s) netstat(1) */ - int64_t ph_ppcb; - }; + kvaddr_t inp_ppcb; /* (s) netstat(1) */ int64_t inp_spare64[4]; uint32_t inp_flow; /* (s) */ uint32_t inp_flowid; /* (s) */ @@ -392,10 +392,12 @@ struct xinpcb { } __aligned(8); struct xinpgen { - size_t xig_len; /* length of this structure */ + ksize_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ + uint32_t _xig_spare32; inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count this time */ + uint64_t _xig_spare64[4]; } __aligned(8); #ifdef _KERNEL void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); @@ -403,7 +405,8 @@ void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { - LIST_ENTRY(inpcbport) phd_hash; + struct epoch_context phd_epoch_ctx; + CK_LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; @@ -436,22 +439,23 @@ struct in_pcblist { * Locking key: * * (c) Constant or nearly constant after initialisation + * (e) - Protected by the net_epoch_prempt epoch * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock - * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting full inpcb list traversal + * Global lock protecting inpcb list modification */ - struct rwlock ipi_lock; + struct mtx ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g/l) */ + struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ u_int ipi_count; /* (l) */ /* @@ -482,9 +486,9 @@ struct inpcbinfo { u_int ipi_hashfields; /* (c) */ /* - * Global lock protecting non-pcbgroup hash lookup tables. + * Global lock protecting modification non-pcbgroup hash lookup tables. */ - struct rwlock ipi_hash_lock; + struct mtx ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and @@ -508,6 +512,13 @@ struct inpcbinfo { u_long ipi_wildmask; /* (p) */ /* + * Load balance groups used for the SO_REUSEPORT_LB option, + * hashed by local port. + */ + struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */ + u_long ipi_lbgrouphashmask; /* (h) */ + + /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ @@ -549,6 +560,27 @@ struct inpcbgroup { struct mtx ipg_lock; } __aligned(CACHE_LINE_SIZE); +/* + * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group + * (or unique address:port combination) can be re-used at most + * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which + * is dynamically resized as processes bind/unbind to that specific group. + */ +struct inpcblbgroup { + CK_LIST_ENTRY(inpcblbgroup) il_list; + struct epoch_context il_epoch_ctx; + uint16_t il_lport; /* (c) */ + u_char il_vflag; /* (c) */ + u_char il_pad; + uint32_t il_pad2; + union in_dependaddr il_dependladdr; /* (c) */ +#define il_laddr il_dependladdr.id46_addr.ia46_addr4 +#define il6_laddr il_dependladdr.id6_addr + uint32_t il_inpsiz; /* max count in il_inp[] (h) */ + uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ + struct inpcb *il_inp[]; /* (h) */ +}; + #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) @@ -593,25 +625,24 @@ struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); -short inp_so_options(const struct inpcb *inp); +int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) -#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) -#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) -#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) -#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) -#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) -#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) + mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) +#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) +#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER_ET((et)) +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) +#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) +#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et) +#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) +#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) +#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) @@ -632,17 +663,16 @@ short inp_so_options(const struct inpcb *inp); #define INP_LIST_UNLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) -#define INP_HASH_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) -#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_LOCKED) -#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_WLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) +#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et) +#define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_hash_et) +#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) +#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) @@ -656,6 +686,10 @@ short inp_so_options(const struct inpcb *inp); (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ + ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* @@ -711,8 +745,8 @@ short inp_so_options(const struct inpcb *inp); /* * Flags for inp_flags2. */ -#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ -#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ +#define INP_2UNUSED1 0x00000001 +#define INP_2UNUSED2 0x00000002 #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ @@ -724,6 +758,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ +#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ /* * Flags passed to in_pcblookup*() functions. diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h index 6d902fe4..934bd812 100644 --- a/freebsd/sys/netinet/ip.h +++ b/freebsd/sys/netinet/ip.h @@ -94,6 +94,11 @@ struct ip { #define IPTOS_PREC_ROUTINE IPTOS_DSCP_CS0 /* + * Offset of Diffserv decimal value to convert it to tos value . + */ +#define IPTOS_DSCP_OFFSET 2 + +/* * Definitions for DiffServ Codepoints as per RFC2474 and RFC5865. */ #define IPTOS_DSCP_CS0 0x00 diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h index a0dfcb0f..1f4be3fd 100644 --- a/freebsd/sys/netinet/ip6.h +++ b/freebsd/sys/netinet/ip6.h @@ -104,6 +104,7 @@ struct ip6_hdr { #define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ #endif /* LITTLE_ENDIAN */ #endif +#define IPV6_FLOWLABEL_LEN 20 #if 1 /* ECN bits proposed by Sally Floyd */ #define IP6TOS_CE 0x01 /* congestion experienced */ diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index 6f5160e0..8f7f6edf 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -189,36 +189,44 @@ static int proto_reg[] = {-1, -1}; */ /* Accept incoming CARP packets. */ -static VNET_DEFINE(int, carp_allow) = 1; +VNET_DEFINE_STATIC(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) +/* Set DSCP in outgoing CARP packets. */ +VNET_DEFINE_STATIC(int, carp_dscp) = 56; +#define V_carp_dscp VNET(carp_dscp) + /* Preempt slower nodes. */ -static VNET_DEFINE(int, carp_preempt) = 0; +VNET_DEFINE_STATIC(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ -static VNET_DEFINE(int, carp_log) = 1; +VNET_DEFINE_STATIC(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ -static VNET_DEFINE(int, carp_demotion) = 0; +VNET_DEFINE_STATIC(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ -static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW; +VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ -static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW; +VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); +static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I", "Accept incoming CARP packets"); +SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_dscp_sysctl, "I", + "DSCP value for carp packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, @@ -935,7 +943,7 @@ carp_send_ad_locked(struct carp_softc *sc) ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; - ip->ip_tos = IPTOS_LOWDELAY; + ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; @@ -985,6 +993,10 @@ carp_send_ad_locked(struct carp_softc *sc) ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; + /* Traffic class isn't defined in ip6 struct instead + * it gets offset into flowid field */ + ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; @@ -1413,6 +1425,7 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) free(im6o->im6o_membership, M_CARP); break; } + in6m_acquire(in6m); im6o->im6o_membership[0] = in6m; im6o->im6o_num_memberships++; @@ -1434,6 +1447,7 @@ carp_multicast_setup(struct carp_if *cif, sa_family_t sa) free(im6o->im6o_membership, M_CARP); break; } + in6m_acquire(in6m); im6o->im6o_membership[1] = in6m; im6o->im6o_num_memberships++; break; @@ -2104,6 +2118,24 @@ carp_allow_sysctl(SYSCTL_HANDLER_ARGS) } static int +carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) +{ + int new, error; + + new = V_carp_dscp; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || !req->newptr) + return (error); + + if (new < 0 || new > 63) + return (EINVAL); + + V_carp_dscp = new; + + return (0); +} + +static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c index 84f39023..fbf74ca1 100644 --- a/freebsd/sys/netinet/ip_divert.c +++ b/freebsd/sys/netinet/ip_divert.c @@ -113,8 +113,8 @@ __FBSDID("$FreeBSD$"); */ /* Internal variables. */ -static VNET_DEFINE(struct inpcbhead, divcb); -static VNET_DEFINE(struct inpcbinfo, divcbinfo); +VNET_DEFINE_STATIC(struct inpcbhead, divcb); +VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); #define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) @@ -194,6 +194,7 @@ divert_packet(struct mbuf *m, int incoming) u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; + struct epoch_tracker et; mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { @@ -274,8 +275,8 @@ divert_packet(struct mbuf *m, int incoming) /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); - INP_INFO_RLOCK(&V_divcbinfo); - LIST_FOREACH(inp, &V_divcb, inp_list) { + INP_INFO_RLOCK_ET(&V_divcbinfo, et); + CK_LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { INP_RLOCK(inp); @@ -292,7 +293,7 @@ divert_packet(struct mbuf *m, int incoming) break; } } - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); @@ -554,7 +555,6 @@ div_detach(struct socket *so) KASSERT(inp != NULL, ("div_detach: inp == NULL")); INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); - /* XXX defer destruction to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_divcbinfo); @@ -634,10 +634,10 @@ static int div_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; - struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and @@ -656,10 +656,10 @@ div_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_WLOCK(&V_divcbinfo); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_WUNLOCK(&V_divcbinfo); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); @@ -674,12 +674,13 @@ div_pcblist(SYSCTL_HANDLER_ARGS) if (error) return error; - il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); - inp_list = il->il_inp_list; + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == NULL) + return ENOMEM; - INP_INFO_RLOCK(&V_divcbinfo); - for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; - inp = LIST_NEXT(inp, inp_list)) { + INP_INFO_RLOCK_ET(&V_divcbinfo, et); + for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { @@ -688,7 +689,7 @@ div_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); n = i; error = 0; @@ -704,11 +705,17 @@ div_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - il->il_count = n; - il->il_pcbinfo = &V_divcbinfo; - epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); + INP_INFO_WLOCK(&V_divcbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_divcbinfo); if (!error) { + struct epoch_tracker et; /* * Give the user an updated idea of our state. * If the generation differs from what we told @@ -716,13 +723,14 @@ div_pcblist(SYSCTL_HANDLER_ARGS) * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&V_divcbinfo); + INP_INFO_RLOCK_ET(&V_divcbinfo, et); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_divcbinfo); + INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } + free(inp_list, M_TEMP); return error; } @@ -802,7 +810,6 @@ div_modevent(module_t mod, int type, void *unused) break; } ip_divert_ptr = NULL; - /* XXX defer to epoch_call ? */ err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); #ifndef VIMAGE diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c index 52cd0b40..1e794f73 100644 --- a/freebsd/sys/netinet/ip_encap.c +++ b/freebsd/sys/netinet/ip_encap.c @@ -6,6 +6,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -58,417 +59,214 @@ * So, clearly good old protosw does not work for protocol #4 and #41. * The code will let you match protocol via src/dst address pair. */ -/* XXX is M_NETADDR correct? */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rtems/bsd/local/opt_mrouting.h> #include <rtems/bsd/local/opt_inet.h> #include <rtems/bsd/local/opt_inet6.h> #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/lock.h> +#include <sys/malloc.h> #include <sys/mutex.h> -#include <sys/socket.h> -#include <sys/sockio.h> #include <sys/mbuf.h> #include <sys/errno.h> -#include <sys/protosw.h> -#include <sys/queue.h> +#include <sys/socket.h> #include <net/if.h> -#include <net/route.h> +#include <net/if_var.h> #include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> #include <netinet/ip_var.h> #include <netinet/ip_encap.h> #ifdef INET6 -#include <netinet/ip6.h> #include <netinet6/ip6_var.h> #endif -#include <machine/stdarg.h> +static MALLOC_DEFINE(M_NETADDR, "encap_export_host", + "Export host address structure"); -#include <sys/kernel.h> -#include <sys/malloc.h> -static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure"); +struct encaptab { + CK_LIST_ENTRY(encaptab) chain; + int proto; + int min_length; + int exact_match; + void *arg; -static void encap_add(struct encaptab *); -static int mask_match(const struct encaptab *, const struct sockaddr *, - const struct sockaddr *); -static void encap_fillarg(struct mbuf *, void *); + encap_lookup_t lookup; + encap_check_t check; + encap_input_t input; +}; + +CK_LIST_HEAD(encaptab_head, encaptab); +#ifdef INET +static struct encaptab_head ipv4_encaptab = CK_LIST_HEAD_INITIALIZER(); +#endif +#ifdef INET6 +static struct encaptab_head ipv6_encaptab = CK_LIST_HEAD_INITIALIZER(); +#endif -/* - * All global variables in ip_encap.c are locked using encapmtx. - */ static struct mtx encapmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); -static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab); - -#ifdef INET -int -encap4_input(struct mbuf **mp, int *offp, int proto) +#define ENCAP_WLOCK() mtx_lock(&encapmtx) +#define ENCAP_WUNLOCK() mtx_unlock(&encapmtx) +#define ENCAP_RLOCK() struct epoch_tracker encap_et; epoch_enter_preempt(net_epoch_preempt, &encap_et) +#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &encap_et) +#define ENCAP_WAIT() epoch_wait_preempt(net_epoch_preempt) + +static struct encaptab * +encap_attach(struct encaptab_head *head, const struct encap_config *cfg, + void *arg, int mflags) { - struct ip *ip; - struct mbuf *m; - struct sockaddr_in s, d; - const struct protosw *psw; - struct encaptab *ep, *match; - void *arg; - int matchprio, off, prio; - - m = *mp; - off = *offp; - ip = mtod(m, struct ip *); - - bzero(&s, sizeof(s)); - s.sin_family = AF_INET; - s.sin_len = sizeof(struct sockaddr_in); - s.sin_addr = ip->ip_src; - bzero(&d, sizeof(d)); - d.sin_family = AF_INET; - d.sin_len = sizeof(struct sockaddr_in); - d.sin_addr = ip->ip_dst; - - arg = NULL; - psw = NULL; - match = NULL; - matchprio = 0; - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != AF_INET) - continue; - if (ep->proto >= 0 && ep->proto != proto) - continue; - if (ep->func) - prio = (*ep->func)(m, off, proto, ep->arg); - else { - /* - * it's inbound traffic, we need to match in reverse - * order - */ - prio = mask_match(ep, (struct sockaddr *)&d, - (struct sockaddr *)&s); - } + struct encaptab *ep, *tmp; - /* - * We prioritize the matches by using bit length of the - * matches. mask_match() and user-supplied matching function - * should return the bit length of the matches (for example, - * if both src/dst are matched for IPv4, 64 should be returned). - * 0 or negative return value means "it did not match". - * - * The question is, since we have two "mask" portion, we - * cannot really define total order between entries. - * For example, which of these should be preferred? - * mask_match() returns 48 (32 + 16) for both of them. - * src=3ffe::/16, dst=3ffe:501::/32 - * src=3ffe:501::/32, dst=3ffe::/16 - * - * We need to loop through all the possible candidates - * to get the best match - the search takes O(n) for - * n attachments (i.e. interfaces). - */ - if (prio <= 0) - continue; - if (prio > matchprio) { - matchprio = prio; - match = ep; - } - } - if (match != NULL) { - psw = match->psw; - arg = match->arg; - } - mtx_unlock(&encapmtx); + if (cfg == NULL || cfg->input == NULL || + (cfg->check == NULL && cfg->lookup == NULL) || + (cfg->lookup != NULL && cfg->exact_match != ENCAP_DRV_LOOKUP) || + (cfg->exact_match == ENCAP_DRV_LOOKUP && cfg->lookup == NULL)) + return (NULL); - if (match != NULL) { - /* found a match, "match" has the best one */ - if (psw != NULL && psw->pr_input != NULL) { - encap_fillarg(m, arg); - (*psw->pr_input)(mp, offp, proto); - } else - m_freem(m); - return (IPPROTO_DONE); + ep = malloc(sizeof(*ep), M_NETADDR, mflags); + if (ep == NULL) + return (NULL); + + ep->proto = cfg->proto; + ep->min_length = cfg->min_length; + ep->exact_match = cfg->exact_match; + ep->arg = arg; + ep->lookup = cfg->exact_match == ENCAP_DRV_LOOKUP ? cfg->lookup: NULL; + ep->check = cfg->exact_match != ENCAP_DRV_LOOKUP ? cfg->check: NULL; + ep->input = cfg->input; + + ENCAP_WLOCK(); + CK_LIST_FOREACH(tmp, head, chain) { + if (tmp->exact_match <= ep->exact_match) + break; } + if (tmp == NULL) + CK_LIST_INSERT_HEAD(head, ep, chain); + else + CK_LIST_INSERT_BEFORE(tmp, ep, chain); + ENCAP_WUNLOCK(); + return (ep); +} + +static int +encap_detach(struct encaptab_head *head, const struct encaptab *cookie) +{ + struct encaptab *ep; - /* last resort: inject to raw socket */ - return (rip_input(mp, offp, proto)); + ENCAP_WLOCK(); + CK_LIST_FOREACH(ep, head, chain) { + if (ep == cookie) { + CK_LIST_REMOVE(ep, chain); + ENCAP_WUNLOCK(); + ENCAP_WAIT(); + free(ep, M_NETADDR); + return (0); + } + } + ENCAP_WUNLOCK(); + return (EINVAL); } -#endif -#ifdef INET6 -int -encap6_input(struct mbuf **mp, int *offp, int proto) +static int +encap_input(struct encaptab_head *head, struct mbuf *m, int off, int proto) { - struct mbuf *m = *mp; - struct ip6_hdr *ip6; - struct sockaddr_in6 s, d; - const struct protosw *psw; struct encaptab *ep, *match; void *arg; - int prio, matchprio; - - ip6 = mtod(m, struct ip6_hdr *); + int matchprio, ret; - bzero(&s, sizeof(s)); - s.sin6_family = AF_INET6; - s.sin6_len = sizeof(struct sockaddr_in6); - s.sin6_addr = ip6->ip6_src; - bzero(&d, sizeof(d)); - d.sin6_family = AF_INET6; - d.sin6_len = sizeof(struct sockaddr_in6); - d.sin6_addr = ip6->ip6_dst; - - arg = NULL; - psw = NULL; match = NULL; matchprio = 0; - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != AF_INET6) - continue; + + ENCAP_RLOCK(); + CK_LIST_FOREACH(ep, head, chain) { if (ep->proto >= 0 && ep->proto != proto) continue; - if (ep->func) - prio = (*ep->func)(m, *offp, proto, ep->arg); - else { - /* - * it's inbound traffic, we need to match in reverse - * order - */ - prio = mask_match(ep, (struct sockaddr *)&d, - (struct sockaddr *)&s); - } - - /* see encap4_input() for issues here */ - if (prio <= 0) + if (ep->min_length > m->m_pkthdr.len) continue; - if (prio > matchprio) { - matchprio = prio; + if (ep->exact_match == ENCAP_DRV_LOOKUP) + ret = (*ep->lookup)(m, off, proto, &arg); + else + ret = (*ep->check)(m, off, proto, ep->arg); + if (ret <= 0) + continue; + if (ret > matchprio) { match = ep; + if (ep->exact_match != ENCAP_DRV_LOOKUP) + arg = ep->arg; + /* + * No need to continue the search, we got the + * exact match. + */ + if (ret >= ep->exact_match) + break; + matchprio = ret; } } - if (match != NULL) { - psw = match->psw; - arg = match->arg; - } - mtx_unlock(&encapmtx); if (match != NULL) { - /* found a match */ - if (psw != NULL && psw->pr_input != NULL) { - encap_fillarg(m, arg); - return (*psw->pr_input)(mp, offp, proto); - } else { - m_freem(m); - return (IPPROTO_DONE); - } + /* found a match, "match" has the best one */ + ret = (*match->input)(m, off, proto, arg); + ENCAP_RUNLOCK(); + MPASS(ret == IPPROTO_DONE); + return (IPPROTO_DONE); } - - /* last resort: inject to raw socket */ - return rip6_input(mp, offp, proto); -} -#endif - -/*lint -sem(encap_add, custodial(1)) */ -static void -encap_add(struct encaptab *ep) -{ - - mtx_assert(&encapmtx, MA_OWNED); - LIST_INSERT_HEAD(&encaptab, ep, chain); + ENCAP_RUNLOCK(); + return (0); } -/* - * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. - * length of mask (sm and dm) is assumed to be same as sp/dp. - * Return value will be necessary as input (cookie) for encap_detach(). - */ +#ifdef INET const struct encaptab * -encap_attach(int af, int proto, const struct sockaddr *sp, - const struct sockaddr *sm, const struct sockaddr *dp, - const struct sockaddr *dm, const struct protosw *psw, void *arg) +ip_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { - struct encaptab *ep; - - /* sanity check on args */ - if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) - return (NULL); - if (sp->sa_len != dp->sa_len) - return (NULL); - if (af != sp->sa_family || af != dp->sa_family) - return (NULL); - /* check if anyone have already attached with exactly same config */ - mtx_lock(&encapmtx); - LIST_FOREACH(ep, &encaptab, chain) { - if (ep->af != af) - continue; - if (ep->proto != proto) - continue; - if (ep->src.ss_len != sp->sa_len || - bcmp(&ep->src, sp, sp->sa_len) != 0 || - bcmp(&ep->srcmask, sm, sp->sa_len) != 0) - continue; - if (ep->dst.ss_len != dp->sa_len || - bcmp(&ep->dst, dp, dp->sa_len) != 0 || - bcmp(&ep->dstmask, dm, dp->sa_len) != 0) - continue; - - mtx_unlock(&encapmtx); - return (NULL); - } - - ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ - if (ep == NULL) { - mtx_unlock(&encapmtx); - return (NULL); - } - bzero(ep, sizeof(*ep)); - - ep->af = af; - ep->proto = proto; - bcopy(sp, &ep->src, sp->sa_len); - bcopy(sm, &ep->srcmask, sp->sa_len); - bcopy(dp, &ep->dst, dp->sa_len); - bcopy(dm, &ep->dstmask, dp->sa_len); - ep->psw = psw; - ep->arg = arg; - - encap_add(ep); - mtx_unlock(&encapmtx); - return (ep); + return (encap_attach(&ipv4_encaptab, cfg, arg, mflags)); } -const struct encaptab * -encap_attach_func(int af, int proto, - int (*func)(const struct mbuf *, int, int, void *), - const struct protosw *psw, void *arg) +int +ip_encap_detach(const struct encaptab *cookie) { - struct encaptab *ep; - /* sanity check on args */ - if (!func) - return (NULL); - - ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ - if (ep == NULL) - return (NULL); - bzero(ep, sizeof(*ep)); - - ep->af = af; - ep->proto = proto; - ep->func = func; - ep->psw = psw; - ep->arg = arg; - - mtx_lock(&encapmtx); - encap_add(ep); - mtx_unlock(&encapmtx); - return (ep); + return (encap_detach(&ipv4_encaptab, cookie)); } int -encap_detach(const struct encaptab *cookie) +encap4_input(struct mbuf **mp, int *offp, int proto) { - const struct encaptab *ep = cookie; - struct encaptab *p; - - mtx_lock(&encapmtx); - LIST_FOREACH(p, &encaptab, chain) { - if (p == ep) { - LIST_REMOVE(p, chain); - mtx_unlock(&encapmtx); - free(p, M_NETADDR); /*XXX*/ - return 0; - } - } - mtx_unlock(&encapmtx); - return EINVAL; + if (encap_input(&ipv4_encaptab, *mp, *offp, proto) != IPPROTO_DONE) + return (rip_input(mp, offp, proto)); + return (IPPROTO_DONE); } +#endif /* INET */ -static int -mask_match(const struct encaptab *ep, const struct sockaddr *sp, - const struct sockaddr *dp) +#ifdef INET6 +const struct encaptab * +ip6_encap_attach(const struct encap_config *cfg, void *arg, int mflags) { - struct sockaddr_storage s; - struct sockaddr_storage d; - int i; - const u_int8_t *p, *q; - u_int8_t *r; - int matchlen; - - if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) - return 0; - if (sp->sa_family != ep->af || dp->sa_family != ep->af) - return 0; - if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) - return 0; - - matchlen = 0; - - p = (const u_int8_t *)sp; - q = (const u_int8_t *)&ep->srcmask; - r = (u_int8_t *)&s; - for (i = 0 ; i < sp->sa_len; i++) { - r[i] = p[i] & q[i]; - /* XXX estimate */ - matchlen += (q[i] ? 8 : 0); - } - p = (const u_int8_t *)dp; - q = (const u_int8_t *)&ep->dstmask; - r = (u_int8_t *)&d; - for (i = 0 ; i < dp->sa_len; i++) { - r[i] = p[i] & q[i]; - /* XXX rough estimate */ - matchlen += (q[i] ? 8 : 0); - } - - /* need to overwrite len/family portion as we don't compare them */ - s.ss_len = sp->sa_len; - s.ss_family = sp->sa_family; - d.ss_len = dp->sa_len; - d.ss_family = dp->sa_family; - - if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && - bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { - return matchlen; - } else - return 0; + return (encap_attach(&ipv6_encaptab, cfg, arg, mflags)); } -static void -encap_fillarg(struct mbuf *m, void *arg) +int +ip6_encap_detach(const struct encaptab *cookie) { - struct m_tag *tag; - if (arg != NULL) { - tag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT); - if (tag != NULL) { - *(void**)(tag+1) = arg; - m_tag_prepend(m, tag); - } - } + return (encap_detach(&ipv6_encaptab, cookie)); } -void * -encap_getarg(struct mbuf *m) +int +encap6_input(struct mbuf **mp, int *offp, int proto) { - void *p = NULL; - struct m_tag *tag; - tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); - if (tag) { - p = *(void**)(tag+1); - m_tag_delete(m, tag); - } - return p; + if (encap_input(&ipv6_encaptab, *mp, *offp, proto) != IPPROTO_DONE) + return (rip6_input(mp, offp, proto)); + return (IPPROTO_DONE); } +#endif /* INET6 */ diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h index ef232189..f3d1d3af 100644 --- a/freebsd/sys/netinet/ip_encap.h +++ b/freebsd/sys/netinet/ip_encap.h @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,29 +38,33 @@ #ifdef _KERNEL -struct encaptab { - LIST_ENTRY(encaptab) chain; - int af; - int proto; /* -1: don't care, I'll check myself */ - struct sockaddr_storage src; /* my addr */ - struct sockaddr_storage srcmask; - struct sockaddr_storage dst; /* remote addr */ - struct sockaddr_storage dstmask; - int (*func)(const struct mbuf *, int, int, void *); - const struct protosw *psw; /* only pr_input will be used */ - void *arg; /* passed via m->m_pkthdr.aux */ -}; - int encap4_input(struct mbuf **, int *, int); int encap6_input(struct mbuf **, int *, int); -const struct encaptab *encap_attach(int, int, const struct sockaddr *, - const struct sockaddr *, const struct sockaddr *, - const struct sockaddr *, const struct protosw *, void *); -const struct encaptab *encap_attach_func(int, int, - int (*)(const struct mbuf *, int, int, void *), - const struct protosw *, void *); -int encap_detach(const struct encaptab *); -void *encap_getarg(struct mbuf *); + +typedef int (*encap_lookup_t)(const struct mbuf *, int, int, void **); +typedef int (*encap_check_t)(const struct mbuf *, int, int, void *); +typedef int (*encap_input_t)(struct mbuf *, int , int, void *); + +struct encap_config { + int proto; /* protocol */ + int min_length; /* minimum packet length */ + int exact_match; /* a packet is exactly matched */ +#define ENCAP_DRV_LOOKUP 0x7fffffff + + encap_lookup_t lookup; + encap_check_t check; + encap_input_t input; +}; + +struct encaptab; + +const struct encaptab *ip_encap_attach(const struct encap_config *, + void *arg, int mflags); +const struct encaptab *ip6_encap_attach(const struct encap_config *, + void *arg, int mflags); + +int ip_encap_detach(const struct encaptab *); +int ip6_encap_detach(const struct encaptab *); #endif #endif /*_NETINET_IP_ENCAP_H_*/ diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c index b084fdc6..05deb4d8 100644 --- a/freebsd/sys/netinet/ip_fastfwd.c +++ b/freebsd/sys/netinet/ip_fastfwd.c @@ -155,7 +155,7 @@ ip_tryforward(struct mbuf *m) struct mbuf *m0 = NULL; struct nhop4_basic nh; struct sockaddr_in dst; - struct in_addr odest, dest; + struct in_addr dest, odest, rtdest; uint16_t ip_len, ip_off; int error = 0; struct m_tag *fwd_tag = NULL; @@ -296,12 +296,31 @@ passin: #endif /* + * Next hop forced by pfil(9) hook? + */ + if ((m->m_flags & M_IP_NEXTHOP) && + ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { + /* + * Now we will find route to forced destination. + */ + dest.s_addr = ((struct sockaddr_in *) + (fwd_tag + 1))->sin_addr.s_addr; + m_tag_delete(m, fwd_tag); + m->m_flags &= ~M_IP_NEXTHOP; + } + + /* * Find route to destination. */ if (ip_findroute(&nh, dest, m) != 0) return (NULL); /* icmp unreach already sent */ /* + * Avoid second route lookup by caching destination. + */ + rtdest.s_addr = dest.s_addr; + + /* * Step 5: outgoing firewall packet processing */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) @@ -323,6 +342,8 @@ passin: */ if (m->m_flags & M_IP_NEXTHOP) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + else + fwd_tag = NULL; if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { /* * Is it now for a local address on this host? @@ -344,7 +365,8 @@ forwardlocal: m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } - if (ip_findroute(&nh, dest, m) != 0) + if (dest.s_addr != rtdest.s_addr && + ip_findroute(&nh, dest, m) != 0) return (NULL); /* icmp unreach already sent */ } diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h index 286eb03f..a7bf5b4d 100644 --- a/freebsd/sys/netinet/ip_fw.h +++ b/freebsd/sys/netinet/ip_fw.h @@ -285,6 +285,8 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_EXTERNAL_INSTANCE, /* arg1=id of eaction handler instance */ O_EXTERNAL_DATA, /* variable length data */ + O_SKIP_ACTION, /* none */ + O_LAST_OPCODE /* not an opcode! */ }; diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c index 673e23d5..65ab0ab9 100644 --- a/freebsd/sys/netinet/ip_gre.c +++ b/freebsd/sys/netinet/ip_gre.c @@ -4,7 +4,7 @@ * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 1998 The NetBSD Foundation, Inc. - * Copyright (c) 2014 Andrey V. Elsukov <ae@FreeBSD.org> + * Copyright (c) 2014, 2018 Andrey V. Elsukov <ae@FreeBSD.org> * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -43,18 +43,17 @@ __FBSDID("$FreeBSD$"); #include <rtems/bsd/local/opt_inet6.h> #include <sys/param.h> +#include <sys/jail.h> #include <sys/systm.h> -#include <sys/mbuf.h> #include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/protosw.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> #include <sys/errno.h> -#include <sys/time.h> #include <sys/kernel.h> -#include <sys/lock.h> -#include <sys/rmlock.h> #include <sys/sysctl.h> -#include <net/ethernet.h> +#include <sys/malloc.h> +#include <sys/proc.h> + #include <net/if.h> #include <net/if_var.h> #include <net/vnet.h> @@ -71,61 +70,177 @@ __FBSDID("$FreeBSD$"); #include <net/if_gre.h> -extern struct domain inetdomain; -static const struct protosw in_gre_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_GRE, - .pr_flags = PR_ATOMIC|PR_ADDR, - .pr_input = gre_input, - .pr_output = rip_output, - .pr_ctlinput = rip_ctlinput, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; - #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; #define V_ip_gre_ttl VNET(ip_gre_ttl) SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(ip_gre_ttl), 0, ""); + &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); + +VNET_DEFINE_STATIC(struct gre_list *, ipv4_hashtbl) = NULL; +#define V_ipv4_hashtbl VNET(ipv4_hashtbl) +#define GRE_HASH(src, dst) (V_ipv4_hashtbl[\ + in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) +#define GRE_HASH_SC(sc) GRE_HASH((sc)->gre_oip.ip_src.s_addr,\ + (sc)->gre_oip.ip_dst.s_addr) + +static uint32_t +in_gre_hashval(in_addr_t src, in_addr_t dst) +{ + uint32_t ret; + + ret = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT); + return (fnv_32_buf(&dst, sizeof(dst), ret)); +} + +static int +in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst) +{ + struct gre_softc *tmp; + + if (sc->gre_family == AF_INET && + sc->gre_oip.ip_src.s_addr == src && + sc->gre_oip.ip_dst.s_addr == dst) + return (EEXIST); + + CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { + if (tmp == sc) + continue; + if (tmp->gre_oip.ip_src.s_addr == src && + tmp->gre_oip.ip_dst.s_addr == dst) + return (EADDRNOTAVAIL); + } + return (0); +} static int -in_gre_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +in_gre_lookup(const struct mbuf *m, int off, int proto, void **arg) { - GRE_RLOCK_TRACKER; + const struct ip *ip; struct gre_softc *sc; - struct ip *ip; - sc = (struct gre_softc *)arg; - if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) + if (V_ipv4_hashtbl == NULL) return (0); - M_ASSERTPKTHDR(m); - /* - * We expect that payload contains at least IPv4 - * or IPv6 packet. - */ - if (m->m_pkthdr.len < sizeof(struct greip) + sizeof(struct ip)) - return (0); + MPASS(in_epoch(net_epoch_preempt)); + ip = mtod(m, const struct ip *); + CK_LIST_FOREACH(sc, &GRE_HASH(ip->ip_dst.s_addr, + ip->ip_src.s_addr), chain) { + /* + * This is an inbound packet, its ip_dst is source address + * in softc. + */ + if (sc->gre_oip.ip_src.s_addr == ip->ip_dst.s_addr && + sc->gre_oip.ip_dst.s_addr == ip->ip_src.s_addr) { + if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) + return (0); + *arg = sc; + return (ENCAP_DRV_LOOKUP); + } + } + return (0); +} - GRE_RLOCK(sc); - if (sc->gre_family == 0) - goto bad; +static void +in_gre_attach(struct gre_softc *sc) +{ - KASSERT(sc->gre_family == AF_INET, - ("wrong gre_family: %d", sc->gre_family)); + sc->gre_hlen = sizeof(struct greip); + sc->gre_oip.ip_v = IPVERSION; + sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; + sc->gre_oip.ip_p = IPPROTO_GRE; + gre_updatehdr(sc, &sc->gre_gihdr->gi_gre); + CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); +} - ip = mtod(m, struct ip *); - if (sc->gre_oip.ip_src.s_addr != ip->ip_dst.s_addr || - sc->gre_oip.ip_dst.s_addr != ip->ip_src.s_addr) - goto bad; +void +in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) +{ - GRE_RUNLOCK(sc); - return (32 * 2); -bad: - GRE_RUNLOCK(sc); - return (0); + MPASS(cmd == GRESKEY || cmd == GRESOPTS); + + /* NOTE: we are protected with gre_ioctl_sx lock */ + MPASS(sc->gre_family == AF_INET); + CK_LIST_REMOVE(sc, chain); + GRE_WAIT(); + if (cmd == GRESKEY) + sc->gre_key = value; + else + sc->gre_options = value; + in_gre_attach(sc); +} + +int +in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data) +{ + struct ifreq *ifr = (struct ifreq *)data; + struct sockaddr_in *dst, *src; + struct ip *ip; + int error; + + /* NOTE: we are protected with gre_ioctl_sx lock */ + error = EINVAL; + switch (cmd) { + case SIOCSIFPHYADDR: + src = &((struct in_aliasreq *)data)->ifra_addr; + dst = &((struct in_aliasreq *)data)->ifra_dstaddr; + + /* sanity checks */ + if (src->sin_family != dst->sin_family || + src->sin_family != AF_INET || + src->sin_len != dst->sin_len || + src->sin_len != sizeof(*src)) + break; + if (src->sin_addr.s_addr == INADDR_ANY || + dst->sin_addr.s_addr == INADDR_ANY) { + error = EADDRNOTAVAIL; + break; + } + if (V_ipv4_hashtbl == NULL) + V_ipv4_hashtbl = gre_hashinit(); + error = in_gre_checkdup(sc, src->sin_addr.s_addr, + dst->sin_addr.s_addr); + if (error == EADDRNOTAVAIL) + break; + if (error == EEXIST) { + /* Addresses are the same. Just return. */ + error = 0; + break; + } + ip = malloc(sizeof(struct greip) + 3 * sizeof(uint32_t), + M_GRE, M_WAITOK | M_ZERO); + ip->ip_src.s_addr = src->sin_addr.s_addr; + ip->ip_dst.s_addr = dst->sin_addr.s_addr; + if (sc->gre_family != 0) { + /* Detach existing tunnel first */ + CK_LIST_REMOVE(sc, chain); + GRE_WAIT(); + free(sc->gre_hdr, M_GRE); + /* XXX: should we notify about link state change? */ + } + sc->gre_family = AF_INET; + sc->gre_hdr = ip; + sc->gre_oseq = 0; + sc->gre_iseq = UINT32_MAX; + in_gre_attach(sc); + break; + case SIOCGIFPSRCADDR: + case SIOCGIFPDSTADDR: + if (sc->gre_family != AF_INET) { + error = EADDRNOTAVAIL; + break; + } + src = (struct sockaddr_in *)&ifr->ifr_addr; + memset(src, 0, sizeof(*src)); + src->sin_family = AF_INET; + src->sin_len = sizeof(*src); + src->sin_addr = (cmd == SIOCGIFPSRCADDR) ? + sc->gre_oip.ip_src: sc->gre_oip.ip_dst; + error = prison_if(curthread->td_ucred, (struct sockaddr *)src); + if (error != 0) + memset(src, 0, sizeof(*src)); + break; + } + return (error); } int @@ -158,14 +273,30 @@ in_gre_output(struct mbuf *m, int af, int hlen) return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL)); } -int -in_gre_attach(struct gre_softc *sc) +static const struct encaptab *ecookie = NULL; +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_GRE, + .min_length = sizeof(struct greip) + sizeof(struct ip), + .exact_match = ENCAP_DRV_LOOKUP, + .lookup = in_gre_lookup, + .input = gre_input +}; + +void +in_gre_init(void) { - KASSERT(sc->gre_ecookie == NULL, ("gre_ecookie isn't NULL")); - sc->gre_ecookie = encap_attach_func(AF_INET, IPPROTO_GRE, - in_gre_encapcheck, &in_gre_protosw, sc); - if (sc->gre_ecookie == NULL) - return (EEXIST); - return (0); + if (!IS_DEFAULT_VNET(curvnet)) + return; + ecookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); +} + +void +in_gre_uninit(void) +{ + + if (IS_DEFAULT_VNET(curvnet)) + ip_encap_detach(ecookie); + if (V_ipv4_hashtbl != NULL) + gre_hashdestroy(V_ipv4_hashtbl); } diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c index 3fc59a14..414e3812 100644 --- a/freebsd/sys/netinet/ip_icmp.c +++ b/freebsd/sys/netinet/ip_icmp.c @@ -84,13 +84,13 @@ __FBSDID("$FreeBSD$"); * routines to turnaround packets back to the originator, and * host table maintenance routines. */ -static VNET_DEFINE(int, icmplim) = 200; +VNET_DEFINE_STATIC(int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); -static VNET_DEFINE(int, icmplim_output) = 1; +VNET_DEFINE_STATIC(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, @@ -106,13 +106,13 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ -static VNET_DEFINE(int, icmpmaskrepl) = 0; +VNET_DEFINE_STATIC(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets"); -static VNET_DEFINE(u_int, icmpmaskfake) = 0; +VNET_DEFINE_STATIC(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, @@ -124,37 +124,37 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); -static VNET_DEFINE(int, log_redirect) = 0; +VNET_DEFINE_STATIC(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); -static VNET_DEFINE(char, reply_src[IFNAMSIZ]); +VNET_DEFINE_STATIC(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "ICMP reply source for non-local packets"); -static VNET_DEFINE(int, icmp_rfi) = 0; +VNET_DEFINE_STATIC(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); - -static VNET_DEFINE(int, icmp_quotelen) = 8; +/* Router requirements RFC 1812 section 4.3.2.3 requires 576 - 28. */ +VNET_DEFINE_STATIC(int, icmp_quotelen) = 548; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); -static VNET_DEFINE(int, icmpbmcastecho) = 0; +VNET_DEFINE_STATIC(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, "Reply to multicast ICMP Echo Request and Timestamp packets"); -static VNET_DEFINE(int, icmptstamprepl) = 1; +VNET_DEFINE_STATIC(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, @@ -1003,7 +1003,7 @@ struct icmp_rate { const char *descr; struct counter_rate cr; }; -static VNET_DEFINE(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = { +VNET_DEFINE_STATIC(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, diff --git a/freebsd/sys/netinet/ip_id.c b/freebsd/sys/netinet/ip_id.c index 02bf2c5b..85a67612 100644 --- a/freebsd/sys/netinet/ip_id.c +++ b/freebsd/sys/netinet/ip_id.c @@ -100,8 +100,8 @@ __FBSDID("$FreeBSD$"); * suggested by RFC6864. We use per-CPU counter for that, or if * user wants to, we can turn on random ID generation. */ -static VNET_DEFINE(int, ip_rfc6864) = 1; -static VNET_DEFINE(int, ip_do_randomid) = 0; +VNET_DEFINE_STATIC(int, ip_rfc6864) = 1; +VNET_DEFINE_STATIC(int, ip_do_randomid) = 0; #define V_ip_rfc6864 VNET(ip_rfc6864) #define V_ip_do_randomid VNET(ip_do_randomid) @@ -109,13 +109,13 @@ static VNET_DEFINE(int, ip_do_randomid) = 0; * Random ID state engine. */ static MALLOC_DEFINE(M_IPID, "ipid", "randomized ip id state"); -static VNET_DEFINE(uint16_t *, id_array); -static VNET_DEFINE(bitstr_t *, id_bits); -static VNET_DEFINE(int, array_ptr); -static VNET_DEFINE(int, array_size); -static VNET_DEFINE(int, random_id_collisions); -static VNET_DEFINE(int, random_id_total); -static VNET_DEFINE(struct mtx, ip_id_mtx); +VNET_DEFINE_STATIC(uint16_t *, id_array); +VNET_DEFINE_STATIC(bitstr_t *, id_bits); +VNET_DEFINE_STATIC(int, array_ptr); +VNET_DEFINE_STATIC(int, array_size); +VNET_DEFINE_STATIC(int, random_id_collisions); +VNET_DEFINE_STATIC(int, random_id_total); +VNET_DEFINE_STATIC(struct mtx, ip_id_mtx); #define V_id_array VNET(id_array) #define V_id_bits VNET(id_bits) #define V_array_ptr VNET(array_ptr) @@ -127,7 +127,7 @@ static VNET_DEFINE(struct mtx, ip_id_mtx); /* * Non-random ID state engine is simply a per-cpu counter. */ -static VNET_DEFINE(counter_u64_t, ip_id); +VNET_DEFINE_STATIC(counter_u64_t, ip_id); #define V_ip_id VNET(ip_id) static int sysctl_ip_randomid(SYSCTL_HANDLER_ARGS); diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c index 343eec5e..2852b52e 100644 --- a/freebsd/sys/netinet/ip_input.c +++ b/freebsd/sys/netinet/ip_input.c @@ -111,7 +111,7 @@ SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); -static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ +VNET_DEFINE_STATIC(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, @@ -130,7 +130,7 @@ SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_R * to the loopback interface instead of the interface where the * packets for those addresses are received. */ -static VNET_DEFINE(int, ip_checkinterface); +VNET_DEFINE_STATIC(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, @@ -559,13 +559,15 @@ tooshort: /* * Try to forward the packet, but if we fail continue. + * ip_tryforward() does not generate redirects, so fall + * through to normal processing if redirects are required. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ - if (V_ipforwarding != 0 + if (V_ipforwarding != 0 && V_ipsendredirects == 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) @@ -1349,7 +1351,7 @@ makedummy: * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ -static VNET_DEFINE(int, ip_rsvp_on); +VNET_DEFINE_STATIC(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index ac901601..987549c6 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -127,7 +127,7 @@ __FBSDID("$FreeBSD$"); #define VIFI_INVALID ((vifi_t) -1) -static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ +VNET_DEFINE_STATIC(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); @@ -151,14 +151,14 @@ static struct mtx mrouter_mtx; static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ -static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); +VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); -static VNET_DEFINE(u_long, mfchash); +VNET_DEFINE_STATIC(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ @@ -166,9 +166,9 @@ static VNET_DEFINE(u_long, mfchash); #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ -static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ +VNET_DEFINE_STATIC(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) -static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); +VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; @@ -179,9 +179,9 @@ static struct mtx mfc_mtx; mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) -static VNET_DEFINE(vifi_t, numvifs); +VNET_DEFINE_STATIC(vifi_t, numvifs); #define V_numvifs VNET(numvifs) -static VNET_DEFINE(struct vif, viftable[MAXVIFS]); +VNET_DEFINE_STATIC(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", @@ -197,7 +197,7 @@ static struct mtx vif_mtx; static eventhandler_tag if_detach_event_tag = NULL; -static VNET_DEFINE(struct callout, expire_upcalls_ch); +VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ @@ -212,9 +212,9 @@ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 -static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); +VNET_DEFINE_STATIC(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) -static VNET_DEFINE(struct callout, bw_meter_ch); +VNET_DEFINE_STATIC(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ @@ -222,16 +222,16 @@ static VNET_DEFINE(struct callout, bw_meter_ch); * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ -static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); +VNET_DEFINE_STATIC(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) -static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ +VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) -static VNET_DEFINE(struct callout, bw_upcalls_ch); +VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ -static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); +VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); @@ -244,20 +244,17 @@ SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); -extern struct domain inetdomain; -static const struct protosw in_pim_protosw = { - .pr_type = SOCK_RAW, - .pr_domain = &inetdomain, - .pr_protocol = IPPROTO_PIM, - .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, - .pr_input = pim_input, - .pr_output = rip_output, - .pr_ctloutput = rip_ctloutput, - .pr_usrreqs = &rip_usrreqs -}; static const struct encaptab *pim_encap_cookie; - static int pim_encapcheck(const struct mbuf *, int, int, void *); +static int pim_input(struct mbuf *, int, int, void *); + +static const struct encap_config ipv4_encap_cfg = { + .proto = IPPROTO_PIM, + .min_length = sizeof(struct ip) + PIM_MINLEN, + .exact_match = 8, + .check = pim_encapcheck, + .input = pim_input +}; /* * Note: the PIM Register encapsulation adds the following in front of a @@ -302,9 +299,9 @@ static struct pim_encap_pimhdr pim_encap_pimhdr = { 0 /* flags */ }; -static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; +VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) -static VNET_DEFINE(struct ifnet, multicast_register_if); +VNET_DEFINE_STATIC(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* @@ -373,9 +370,9 @@ static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); -static VNET_DEFINE(uint32_t, mrt_api_config); +VNET_DEFINE_STATIC(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) -static VNET_DEFINE(int, pim_assert_enabled); +VNET_DEFINE_STATIC(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ @@ -2546,16 +2543,12 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, * into the kernel. */ static int -pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +pim_encapcheck(const struct mbuf *m __unused, int off __unused, + int proto __unused, void *arg __unused) { -#ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); -#endif - if (proto != IPPROTO_PIM) - return 0; /* not for us; reject the datagram. */ - - return 64; /* claim the datagram. */ + return (8); /* claim the datagram. */ } /* @@ -2566,18 +2559,15 @@ pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ -int -pim_input(struct mbuf **mp, int *offp, int proto) +static int +pim_input(struct mbuf *m, int off, int proto, void *arg __unused) { - struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; - int iphlen = *offp; + int iphlen = off; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; - - *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); @@ -2781,10 +2771,7 @@ pim_input_to_daemon: * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ - *mp = m; - rip_input(mp, offp, proto); - - return (IPPROTO_DONE); + return (rip_input(&m, &off, proto)); } static int @@ -2877,8 +2864,7 @@ ip_mroute_modevent(module_t mod, int type, void *unused) TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); - pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, - pim_encapcheck, &in_pim_protosw, NULL); + pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); @@ -2921,7 +2907,7 @@ ip_mroute_modevent(module_t mod, int type, void *unused) EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { - encap_detach(pim_encap_cookie); + ip_encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c index cc2f3eed..7c189bdb 100644 --- a/freebsd/sys/netinet/ip_options.c +++ b/freebsd/sys/netinet/ip_options.c @@ -70,13 +70,13 @@ __FBSDID("$FreeBSD$"); #include <sys/socketvar.h> -static VNET_DEFINE(int, ip_dosourceroute); +VNET_DEFINE_STATIC(int, ip_dosourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, "Enable forwarding source routed IP packets"); #define V_ip_dosourceroute VNET(ip_dosourceroute) -static VNET_DEFINE(int, ip_acceptsourceroute); +VNET_DEFINE_STATIC(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index 792f2311..5f643746 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -82,6 +82,10 @@ __FBSDID("$FreeBSD$"); #include <netinet/in_var.h> #include <netinet/ip_var.h> #include <netinet/ip_options.h> + +#include <netinet/udp.h> +#include <netinet/udp_var.h> + #ifdef SCTP #include <netinet/sctp.h> #include <netinet/sctp_crc32.h> @@ -922,24 +926,34 @@ void in_delayed_cksum(struct mbuf *m) { struct ip *ip; - uint16_t csum, offset, ip_len; + struct udphdr *uh; + uint16_t cklen, csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; - ip_len = ntohs(ip->ip_len); - csum = in_cksum_skip(m, ip_len, offset); - if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) - csum = 0xffff; - offset += m->m_pkthdr.csum_data; /* checksum offset */ - /* find the mbuf in the chain where the checksum starts*/ - while ((m != NULL) && (offset >= m->m_len)) { - offset -= m->m_len; - m = m->m_next; + if (m->m_pkthdr.csum_flags & CSUM_UDP) { + /* if udp header is not in the first mbuf copy udplen */ + if (offset + sizeof(struct udphdr) > m->m_len) + m_copydata(m, offset + offsetof(struct udphdr, + uh_ulen), sizeof(cklen), (caddr_t)&cklen); + else { + uh = (struct udphdr *)mtodo(m, offset); + cklen = ntohs(uh->uh_ulen); + } + csum = in_cksum_skip(m, cklen + offset, offset); + if (csum == 0) + csum = 0xffff; + } else { + cklen = ntohs(ip->ip_len); + csum = in_cksum_skip(m, cklen, offset); } - KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); - KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); - *(u_short *)(m->m_data + offset) = csum; + offset += m->m_pkthdr.csum_data; /* checksum offset */ + + if (offset + sizeof(csum) > m->m_len) + m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); + else + *(u_short *)mtodo(m, offset) = csum; } /* @@ -980,6 +994,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); error = 0; break; + case SO_REUSEPORT_LB: + INP_WLOCK(inp); + if ((so->so_options & SO_REUSEPORT_LB) != 0) + inp->inp_flags2 |= INP_REUSEPORT_LB; + else + inp->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(inp); + error = 0; + break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; @@ -1235,13 +1258,23 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: - if (inp->inp_options) - error = sooptcopyout(sopt, - mtod(inp->inp_options, - char *), - inp->inp_options->m_len); - else + INP_RLOCK(inp); + if (inp->inp_options) { + struct mbuf *options; + + options = m_dup(inp->inp_options, M_NOWAIT); + INP_RUNLOCK(inp); + if (options != NULL) { + error = sooptcopyout(sopt, + mtod(options, char *), + options->m_len); + m_freem(options); + } else + error = ENOMEM; + } else { + INP_RUNLOCK(inp); sopt->sopt_valsize = 0; + } break; case IP_TOS: diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c index 64660228..95603390 100644 --- a/freebsd/sys/netinet/ip_reass.c +++ b/freebsd/sys/netinet/ip_reass.c @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include <sys/hash.h> #include <sys/mbuf.h> #include <sys/malloc.h> +#include <sys/limits.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/sysctl.h> @@ -65,18 +66,19 @@ SYSCTL_DECL(_net_inet_ip); /* * Reassembly headers are stored in hash buckets. */ -#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH_LOG2 10 #define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) #define IPREASS_HMASK (IPREASS_NHASH - 1) struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; + int count; }; -static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); +VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]); #define V_ipq VNET(ipq) -static VNET_DEFINE(uint32_t, ipq_hashseed); +VNET_DEFINE_STATIC(uint32_t, ipq_hashseed); #define V_ipq_hashseed VNET(ipq_hashseed) #define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) @@ -84,6 +86,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed); #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) #define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) +VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); +#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) + void ipreass_init(void); void ipreass_drain(void); void ipreass_slowtimo(void); @@ -91,28 +96,54 @@ void ipreass_slowtimo(void); void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); +static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static void ipreass_zone_change(void *); static void ipreass_drain_tomax(void); -static void ipq_free(struct ipqhead *, struct ipq *); +static void ipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static inline void -ipq_timeout(struct ipqhead *head, struct ipq *fp) +ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static inline void -ipq_drop(struct ipqhead *head, struct ipq *fp) +ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } -static VNET_DEFINE(uma_zone_t, ipq_zone); +/* + * By default, limit the number of IP fragments across all reassembly + * queues to 1/32 of the total number of mbuf clusters. + * + * Limit the total number of reassembly queues per VNET to the + * IP fragment limit, but ensure the limit will not allow any bucket + * to grow above 100 items. (The bucket limit is + * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct + * multiplier to reach a 100-item limit.) + * The 100-item limit was chosen as brief testing seems to show that + * this produces "reasonable" performance on some subset of systems + * under DoS attack. + */ +#define IP_MAXFRAGS (nmbclusters / 32) +#define IP_MAXFRAGPACKETS (imin(IP_MAXFRAGS, IPREASS_NHASH * 50)) + +static int maxfrags; +static volatile u_int nfrags; +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, + &maxfrags, 0, + "Maximum number of IPv4 fragments allowed across all reassembly queues"); +SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD, + __DEVOLATILE(u_int *, &nfrags), 0, + "Current number of IPv4 fragments across all reassembly queues"); + +VNET_DEFINE_STATIC(uma_zone_t, ipq_zone); #define V_ipq_zone VNET(ipq_zone) SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I", @@ -121,14 +152,18 @@ SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, &VNET_NAME(ipq_zone), "Current number of IPv4 fragment reassembly queue entries"); -static VNET_DEFINE(int, noreass); +VNET_DEFINE_STATIC(int, noreass); #define V_noreass VNET(noreass) -static VNET_DEFINE(int, maxfragsperpacket); +VNET_DEFINE_STATIC(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_maxfragbucketsize, "I", + "Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into @@ -148,9 +183,9 @@ ip_reass(struct mbuf *m) struct mbuf *p, *q, *nq, *t; struct ipq *fp; struct ipqhead *head; - int i, hlen, next; + int i, hlen, next, tmpmax; u_int8_t ecn, ecn0; - uint32_t hash; + uint32_t hash, hashkey[3]; #ifdef RSS uint32_t rss_hash, rss_type; #endif @@ -158,8 +193,12 @@ ip_reass(struct mbuf *m) /* * If no reassembling or maxfragsperpacket are 0, * never accept fragments. + * Also, drop packet if it would exceed the maximum + * number of fragments. */ - if (V_noreass == 1 || V_maxfragsperpacket == 0) { + tmpmax = maxfrags; + if (V_noreass == 1 || V_maxfragsperpacket == 0 || + (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); @@ -204,8 +243,12 @@ ip_reass(struct mbuf *m) m->m_data += hlen; m->m_len -= hlen; - hash = ip->ip_src.s_addr ^ ip->ip_id; - hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK; + hashkey[0] = ip->ip_src.s_addr; + hashkey[1] = ip->ip_dst.s_addr; + hashkey[2] = (uint32_t)ip->ip_p << 16; + hashkey[2] += ip->ip_id; + hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed); + hash &= IPREASS_HMASK; head = &V_ipq[hash].head; IPQ_LOCK(hash); @@ -226,9 +269,12 @@ ip_reass(struct mbuf *m) * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (V_ipq[hash].count < V_ipreass_maxbucketsize) + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); + if (fp == NULL) + goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -238,7 +284,9 @@ ip_reass(struct mbuf *m) mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_ipq[hash].count++; fp->ipq_nfrags = 1; + atomic_add_int(&nfrags, 1); fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; @@ -249,6 +297,7 @@ ip_reass(struct mbuf *m) goto done; } else { fp->ipq_nfrags++; + atomic_add_int(&nfrags, 1); #ifdef MAC mac_ipq_update(m, fp); #endif @@ -325,6 +374,7 @@ ip_reass(struct mbuf *m) m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; + atomic_subtract_int(&nfrags, 1); m_freem(q); } @@ -342,7 +392,7 @@ ip_reass(struct mbuf *m) for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); @@ -350,7 +400,7 @@ ip_reass(struct mbuf *m) /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -361,7 +411,7 @@ ip_reass(struct mbuf *m) ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -390,6 +440,7 @@ ip_reass(struct mbuf *m) while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); + atomic_subtract_int(&nfrags, fp->ipq_nfrags); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); @@ -404,6 +455,7 @@ ip_reass(struct mbuf *m) ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); + V_ipq[hash].count--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); @@ -449,8 +501,10 @@ ip_reass(struct mbuf *m) dropfrag: IPSTAT_INC(ips_fragdropped); - if (fp != NULL) + if (fp != NULL) { fp->ipq_nfrags--; + atomic_subtract_int(&nfrags, 1); + } m_freem(m); done: IPQ_UNLOCK(hash); @@ -465,21 +519,27 @@ done: void ipreass_init(void) { + int max; for (int i = 0; i < IPREASS_NHASH; i++) { TAILQ_INIT(&V_ipq[i].head); mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF | MTX_DUPOK); + V_ipq[i].count = 0; } V_ipq_hashseed = arc4random(); V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + max = IP_MAXFRAGPACKETS; + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { + maxfrags = IP_MAXFRAGS; EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + } } /* @@ -494,7 +554,7 @@ ipreass_slowtimo(void) IPQ_LOCK(i); TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp) if (--fp->ipq_ttl == 0) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -509,7 +569,10 @@ ipreass_drain(void) for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); while(!TAILQ_EMPTY(&V_ipq[i].head)) - ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head)); + ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head)); + KASSERT(V_ipq[i].count == 0, + ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, + V_ipq[i].count, V_ipq)); IPQ_UNLOCK(i); } } @@ -537,9 +600,23 @@ ipreass_destroy(void) static void ipreass_drain_tomax(void) { + struct ipq *fp; int target; /* + * Make sure each bucket is under the new limit. If + * necessary, drop enough of the oldest elements from + * each bucket to get under the new limit. + */ + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + while (V_ipq[i].count > V_ipreass_maxbucketsize && + (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) + ipq_timeout(&V_ipq[i], fp); + IPQ_UNLOCK(i); + } + + /* * If we are over the maximum number of fragments, * drain off enough to get down to the new limit, * stripping off last elements on queues. Every @@ -547,13 +624,11 @@ ipreass_drain_tomax(void) */ target = uma_zone_get_max(V_ipq_zone); while (uma_zone_get_cur(V_ipq_zone) > target) { - struct ipq *fp; - for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); if (fp != NULL) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -562,9 +637,20 @@ ipreass_drain_tomax(void) static void ipreass_zone_change(void *tag) { - - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); - ipreass_drain_tomax(); + VNET_ITERATOR_DECL(vnet_iter); + int max; + + maxfrags = IP_MAXFRAGS; + max = IP_MAXFRAGPACKETS; + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); + ipreass_drain_tomax(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); } /* @@ -592,6 +678,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) * and place an extreme upper bound. */ max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); V_noreass = 0; } else if (max == 0) { @@ -600,6 +687,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) } else if (max == -1) { V_noreass = 0; uma_zone_set_max(V_ipq_zone, 0); + V_ipreass_maxbucketsize = INT_MAX; } else return (EINVAL); return (0); @@ -613,49 +701,72 @@ static struct ipq * ipq_reuse(int start) { struct ipq *fp; - int i; + int bucket, i; IPQ_LOCK_ASSERT(start); - for (i = start;; i++) { - if (i == IPREASS_NHASH) - i = 0; - if (i != start && IPQ_TRYLOCK(i) == 0) + for (i = 0; i < IPREASS_NHASH; i++) { + bucket = (start + i) % IPREASS_NHASH; + if (bucket != start && IPQ_TRYLOCK(bucket) == 0) continue; - fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); if (fp) { struct mbuf *m; IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); + atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { m = fp->ipq_frags; fp->ipq_frags = m->m_nextpkt; m_freem(m); } - TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list); - if (i != start) - IPQ_UNLOCK(i); - IPQ_LOCK_ASSERT(start); - return (fp); + TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); + V_ipq[bucket].count--; + if (bucket != start) + IPQ_UNLOCK(bucket); + break; } - if (i != start) - IPQ_UNLOCK(i); + if (bucket != start) + IPQ_UNLOCK(bucket); } + IPQ_LOCK_ASSERT(start); + return (fp); } /* * Free a fragment reassembly header and all associated datagrams. */ static void -ipq_free(struct ipqhead *fhp, struct ipq *fp) +ipq_free(struct ipqbucket *bucket, struct ipq *fp) { struct mbuf *q; + atomic_subtract_int(&nfrags, fp->ipq_nfrags); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } - TAILQ_REMOVE(fhp, fp, ipq_list); + TAILQ_REMOVE(&bucket->head, fp, ipq_list); + bucket->count--; uma_zfree(V_ipq_zone, fp); } + +/* + * Get or set the maximum number of reassembly queues per bucket. + */ +static int +sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) +{ + int error, max; + + max = V_ipreass_maxbucketsize; + error = sysctl_handle_int(oidp, &max, 0, req); + if (error || !req->newptr) + return (error); + if (max <= 0) + return (EINVAL); + V_ipreass_maxbucketsize = max; + ipreass_drain_tomax(); + return (0); +} diff --git a/freebsd/sys/netinet/libalias/alias.c b/freebsd/sys/netinet/libalias/alias.c index 2dd5b999..d4eeb040 100644 --- a/freebsd/sys/netinet/libalias/alias.c +++ b/freebsd/sys/netinet/libalias/alias.c @@ -1753,7 +1753,8 @@ LibAliasUnLoadAllModule(void) * the input packet, on failure NULL. The input packet is always consumed. */ struct mbuf * -m_megapullup(struct mbuf *m, int len) { +m_megapullup(struct mbuf *m, int len) +{ struct mbuf *mcl; if (len > m->m_pkthdr.len) @@ -1762,7 +1763,14 @@ m_megapullup(struct mbuf *m, int len) { if (m->m_next == NULL && M_WRITABLE(m)) return (m); - mcl = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (len <= MJUMPAGESIZE) + mcl = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); + else if (len <= MJUM9BYTES) + mcl = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES); + else if (len <= MJUM16BYTES) + mcl = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM16BYTES); + else + goto bad; if (mcl == NULL) goto bad; m_align(mcl, len); diff --git a/freebsd/sys/netinet/libalias/alias_irc.c b/freebsd/sys/netinet/libalias/alias_irc.c index 1dbb9ddf..19337121 100644 --- a/freebsd/sys/netinet/libalias/alias_irc.c +++ b/freebsd/sys/netinet/libalias/alias_irc.c @@ -100,8 +100,7 @@ static int fingerprint(struct libalias *la, struct alias_data *ah) { - if (ah->dport == NULL || ah->dport == NULL || ah->lnk == NULL || - ah->maxpktsize == 0) + if (ah->dport == NULL || ah->lnk == NULL || ah->maxpktsize == 0) return (-1); if (ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_1 || ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_2) diff --git a/freebsd/sys/netinet/libalias/alias_mod.h b/freebsd/sys/netinet/libalias/alias_mod.h index c646f794..a894b6de 100644 --- a/freebsd/sys/netinet/libalias/alias_mod.h +++ b/freebsd/sys/netinet/libalias/alias_mod.h @@ -41,17 +41,17 @@ MALLOC_DECLARE(M_ALIAS); /* Use kernel allocator. */ #if defined(_SYS_MALLOC_H_) +#undef malloc #ifndef __rtems__ #define malloc(x) malloc(x, M_ALIAS, M_NOWAIT|M_ZERO) #define calloc(n, x) mallocarray((n), (x), M_ALIAS, M_NOWAIT|M_ZERO) #define free(x) free(x, M_ALIAS) #else /* __rtems__ */ -#undef malloc #undef calloc #undef free -#define malloc(x) _bsd_malloc(x, M_ALIAS, M_NOWAIT|M_ZERO) -#define calloc(x, n) malloc(x*n) -#define free(x) _bsd_free(x, M_ALIAS) +#define malloc(x) _bsd_malloc(x, M_ALIAS, M_NOWAIT|M_ZERO) +#define calloc(n, x) mallocarray((n), (x), M_ALIAS, M_NOWAIT|M_ZERO) +#define free(x) _bsd_free(x, M_ALIAS) #endif /* __rtems__ */ #endif #endif diff --git a/freebsd/sys/netinet/pim_var.h b/freebsd/sys/netinet/pim_var.h index e6398a4d..dfb06928 100644 --- a/freebsd/sys/netinet/pim_var.h +++ b/freebsd/sys/netinet/pim_var.h @@ -73,8 +73,6 @@ struct pimstat { #define PIMCTL_STATS 1 /* statistics (read-only) */ #ifdef _KERNEL - -int pim_input(struct mbuf **, int *, int); SYSCTL_DECL(_net_inet_pim); #endif diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c index 7dea3ec1..a97eadae 100644 --- a/freebsd/sys/netinet/raw_ip.c +++ b/freebsd/sys/netinet/raw_ip.c @@ -172,7 +172,7 @@ rip_inshash(struct inpcb *inp) } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; - LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void @@ -182,7 +182,7 @@ rip_delhash(struct inpcb *inp) INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); - LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_hash); } #endif /* INET */ @@ -287,6 +287,7 @@ rip_input(struct mbuf **mp, int *offp, int proto) struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; + struct epoch_tracker et; int hash; *mp = NULL; @@ -301,8 +302,8 @@ rip_input(struct mbuf **mp, int *offp, int proto) hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); - INP_INFO_RLOCK(&V_ripcbinfo); - LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); + CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 @@ -314,27 +315,33 @@ rip_input(struct mbuf **mp, int *offp, int proto) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; - if (jailed_without_vnet(inp->inp_cred)) { - /* - * XXX: If faddr was bound to multicast group, - * jailed raw socket will drop datagram. - */ - if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - continue; - } if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) - (void) rip_append(last, ip, n, &ripsrc); + (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); + last = NULL; } INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) + goto skip_1; + if (jailed_without_vnet(inp->inp_cred)) { + /* + * XXX: If faddr was bound to multicast group, + * jailed raw socket will drop datagram. + */ + if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + goto skip_1; + } last = inp; + continue; + skip_1: + INP_RUNLOCK(inp); } - LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { + CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 @@ -348,6 +355,19 @@ rip_input(struct mbuf **mp, int *offp, int proto) if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; + if (last != NULL) { + struct mbuf *n; + + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (n != NULL) + (void) rip_append(last, ip, n, &ripsrc); + /* XXX count dropped packet */ + INP_RUNLOCK(last); + last = NULL; + } + INP_RLOCK(inp); + if (__predict_false(inp->inp_flags2 & INP_FREED)) + goto skip_2; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; @@ -356,7 +376,7 @@ rip_input(struct mbuf **mp, int *offp, int proto) */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - continue; + goto skip_2; } /* * If this raw socket has multicast state, and we @@ -397,22 +417,15 @@ rip_input(struct mbuf **mp, int *offp, int proto) if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); - continue; + goto skip_2; } } - if (last != NULL) { - struct mbuf *n; - - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (n != NULL) - (void) rip_append(last, ip, n, &ripsrc); - /* XXX count dropped packet */ - INP_RUNLOCK(last); - } - INP_RLOCK(inp); last = inp; + continue; + skip_2: + INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); @@ -853,7 +866,6 @@ rip_detach(struct socket *so) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); - /* XXX defer to epoch_call */ in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); @@ -1023,10 +1035,10 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; - struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and @@ -1045,10 +1057,10 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_WLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_WUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; @@ -1058,12 +1070,11 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); - inp_list = il->il_inp_list; + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - INP_INFO_RLOCK(&V_ripcbinfo); - for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; - inp = LIST_NEXT(inp, inp_list)) { + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); + for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { @@ -1072,7 +1083,7 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); n = i; error = 0; @@ -1088,24 +1099,31 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - il->il_count = n; - il->il_pcbinfo = &V_ripcbinfo; - epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); + INP_INFO_WLOCK(&V_ripcbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { + struct epoch_tracker et; /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ - INP_INFO_RLOCK(&V_ripcbinfo); + INP_INFO_RLOCK_ET(&V_ripcbinfo, et); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_ripcbinfo); + INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } + free(inp_list, M_TEMP); return (error); } diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h index 5a86f108..64fd5442 100644 --- a/freebsd/sys/netinet/sctp.h +++ b/freebsd/sys/netinet/sctp.h @@ -419,7 +419,7 @@ struct sctp_error_unresolv_addr { struct sctp_error_unrecognized_chunk { struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRECOG_CHUNK */ - struct sctp_chunkhdr ch;/* header from chunk in error */ + struct sctp_chunkhdr ch; /* header from chunk in error */ } SCTP_PACKED; struct sctp_error_no_user_data { diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c index d2d990e1..c21e3251 100644 --- a/freebsd/sys/netinet/sctp_asconf.c +++ b/freebsd/sys/netinet/sctp_asconf.c @@ -279,6 +279,7 @@ sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) /* not found */ return (-1); } + /* delete all destination addresses except the source */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net != src_net) { @@ -385,6 +386,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, aparam_length); return (m_reply); } + /* if deleting 0.0.0.0/::0, delete all addresses except src addr */ if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { result = sctp_asconf_del_remote_addrs_except(stcb, src); @@ -403,6 +405,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, } return (m_reply); } + /* delete the address */ result = sctp_del_remote_addr(stcb, sa); /* @@ -618,6 +621,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, serial_num, asoc->asconf_seq_in + 1); return; } + /* it's the expected "next" sequence number, so process it */ asoc->asconf_seq_in = serial_num; /* update sequence */ /* get length of all the param's in the ASCONF */ @@ -642,6 +646,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), ack); } } + m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_ack == NULL) { @@ -976,6 +981,7 @@ sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet) if (stcb->asoc.deleted_primary == NULL) { return; } + if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "assoc_immediate_retrans: Deleted primary is "); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa); @@ -1079,6 +1085,7 @@ sctp_path_check_and_react(struct sctp_tcb *stcb, struct sctp_ifa *newifa) } return; } + /* Multiple local addresses exsist in the association. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* clear any cached route and source address */ @@ -1325,6 +1332,7 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, if (stcb->asoc.asconf_supported == 0) { return (-1); } + /* * if this is deleting the last address from the assoc, mark it as * pending. @@ -1345,6 +1353,7 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, return (-1); } } + /* queue an asconf parameter */ status = sctp_asconf_queue_mgmt(stcb, ifa, type); @@ -1366,6 +1375,7 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, stcb->asoc.asconf_addr_del_pending = NULL; } } + if (pending_delete_queued) { struct sctp_nets *net; @@ -1390,6 +1400,7 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, SCTP_FROM_SCTP_ASCONF, __LINE__); } + /* queue in an advisory set primary too */ (void)sctp_asconf_queue_mgmt(stcb, ifa, SCTP_SET_PRIM_ADDR); /* let caller know we should send this out immediately */ @@ -1687,11 +1698,13 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset, serial_num, asoc->asconf_seq_out_acked + 1); return; } + if (serial_num == asoc->asconf_seq_out - 1) { /* stop our timer */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_5); } + /* process the ASCONF-ACK contents */ ack_length = ntohs(cp->ch.chunk_length) - sizeof(struct sctp_asconf_ack_chunk); @@ -1780,7 +1793,7 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset, * at any given time */ if (last_error_id == 0) - last_error_id--;/* set to "max" value */ + last_error_id--; /* set to "max" value */ TAILQ_FOREACH_SAFE(aa, &stcb->asoc.asconf_queue, next, aa_next) { if (aa->sent == 1) { /* @@ -1980,8 +1993,8 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * sent when the state goes open. */ if (status == 0 && - ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED))) { + ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, stcb->asoc.primary_destination); @@ -2060,6 +2073,7 @@ sctp_asconf_iterator_ep_end(struct sctp_inpcb *inp, void *ptr, uint32_t val SCTP laddr->action = 0; break; } + } } else if (l->action == SCTP_DEL_IP_ADDRESS) { LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { @@ -2093,6 +2107,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, if (ifa->vrf_id != stcb->asoc.vrf_id) { continue; } + /* Same checks again for assoc */ switch (ifa->address.sa.sa_family) { #ifdef INET6 @@ -2229,8 +2244,8 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * count of queued params. If in the non-open * state, these get sent when the assoc goes open. */ - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { if (status >= 0) { num_queued++; } @@ -2283,6 +2298,7 @@ sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) /* Invalid address */ return (-1); } + /* queue an ASCONF:SET_PRIM_ADDR to be sent */ if (!sctp_asconf_queue_add(stcb, ifa, SCTP_SET_PRIM_ADDR)) { /* set primary queuing succeeded */ @@ -2290,8 +2306,8 @@ sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) "set_primary_ip_address_sa: queued on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, @@ -2361,11 +2377,13 @@ sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa) SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length(%u) too short\n", param_length); break; } + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, param_length, aparam_buf); if (aph == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get entire param\n"); break; } + ph = (struct sctp_paramhdr *)(aph + 1); if (sctp_addr_match(ph, &sctp_ifa->address.sa) != 0) { switch (param_type) { @@ -2380,6 +2398,7 @@ sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa) } last_param_type = param_type; } + offset += SCTP_SIZE32(param_length); if (offset >= asconf_limit) { /* no more data in the mbuf chain */ @@ -2463,6 +2482,7 @@ sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { continue; } + sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* @@ -2826,8 +2846,7 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, * out the ASCONF. */ if (status == 0 && - SCTP_GET_STATE(&stcb->asoc) == - SCTP_STATE_OPEN) { + SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, @@ -2838,6 +2857,7 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, } } } + next_addr: /* * Sanity check: Make sure the length isn't 0, otherwise @@ -3372,6 +3392,7 @@ sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, if (vrf == NULL) { goto skip_rest; } + SCTP_IPI_ADDR_RLOCK(); LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { diff --git a/freebsd/sys/netinet/sctp_asconf.h b/freebsd/sys/netinet/sctp_asconf.h index 2a372205..581d504c 100644 --- a/freebsd/sys/netinet/sctp_asconf.h +++ b/freebsd/sys/netinet/sctp_asconf.h @@ -60,10 +60,10 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *, struct sockaddr *, uint32_t, uint32_t, struct sctp_ifa *); -extern int +extern int sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val); -extern void +extern void sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, uint32_t type); diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c index d8fbcf6e..0fc076e1 100644 --- a/freebsd/sys/netinet/sctp_auth.c +++ b/freebsd/sys/netinet/sctp_auth.c @@ -1311,6 +1311,7 @@ sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid) /* can't reactivate a deactivated key with other refcounts */ return (-1); } + /* set the (new) active key */ stcb->asoc.authinfo.active_keyid = keyid; /* reset the deactivated flag */ @@ -1365,6 +1366,7 @@ sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, keyid, 0, SCTP_SO_LOCKED); } + /* mark the key as deactivated */ skey->deactivated = 1; @@ -1506,6 +1508,8 @@ sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); + } else { + keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { @@ -1582,6 +1586,7 @@ sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, "Assoc Key"); #endif } + /* set in the active key id */ auth->shared_key_id = htons(keyid); @@ -1769,6 +1774,7 @@ sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, /* If the socket is gone we are out of here */ return; } + if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_AUTHEVNT)) /* event not enabled */ return; @@ -1929,6 +1935,7 @@ sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit) if (num_chunks) got_chklist = 1; } + offset += SCTP_SIZE32(plen); if (offset >= limit) { break; @@ -2023,6 +2030,7 @@ sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb) new_key->key[keylen++] = i; } } + /* append in the HMACs */ ph = (struct sctp_paramhdr *)(new_key->key + keylen); ph->param_type = htons(SCTP_HMAC_LIST); diff --git a/freebsd/sys/netinet/sctp_auth.h b/freebsd/sys/netinet/sctp_auth.h index 66990c30..44126e3e 100644 --- a/freebsd/sys/netinet/sctp_auth.h +++ b/freebsd/sys/netinet/sctp_auth.h @@ -87,7 +87,7 @@ typedef struct sctp_hmaclist { typedef struct sctp_authinformation { sctp_key_t *random; /* local random key (concatenated) */ uint32_t random_len; /* local random number length for param */ - sctp_key_t *peer_random;/* peer's random key (concatenated) */ + sctp_key_t *peer_random; /* peer's random key (concatenated) */ sctp_key_t *assoc_key; /* cached concatenated send key */ sctp_key_t *recv_key; /* cached concatenated recv key */ uint16_t active_keyid; /* active send keyid */ @@ -114,13 +114,13 @@ extern sctp_auth_chklist_t *sctp_copy_chunklist(sctp_auth_chklist_t *chklist); extern int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t *list); extern int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t *list); extern size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t *list); -extern int +extern int sctp_serialize_auth_chunks(const sctp_auth_chklist_t *list, uint8_t *ptr); -extern int +extern int sctp_pack_auth_chunks(const sctp_auth_chklist_t *list, uint8_t *ptr); -extern int +extern int sctp_unpack_auth_chunks(const uint8_t *ptr, uint8_t num_chunks, sctp_auth_chklist_t *list); @@ -141,16 +141,16 @@ extern void sctp_free_sharedkey(sctp_sharedkey_t *skey); extern sctp_sharedkey_t * sctp_find_sharedkey(struct sctp_keyhead *shared_keys, uint16_t key_id); -extern int +extern int sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, sctp_sharedkey_t *new_skey); -extern int +extern int sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest); /* ref counts on shared keys, by key id */ extern void sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t keyid); -extern void +extern void sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t keyid, int so_locked); @@ -161,11 +161,11 @@ extern void sctp_free_hmaclist(sctp_hmaclist_t *list); extern int sctp_auth_add_hmacid(sctp_hmaclist_t *list, uint16_t hmac_id); extern sctp_hmaclist_t *sctp_copy_hmaclist(sctp_hmaclist_t *list); extern sctp_hmaclist_t *sctp_default_supported_hmaclist(void); -extern uint16_t +extern uint16_t sctp_negotiate_hmacid(sctp_hmaclist_t *peer, sctp_hmaclist_t *local); extern int sctp_serialize_hmaclist(sctp_hmaclist_t *list, uint8_t *ptr); -extern int +extern int sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, uint32_t num_hmacs); @@ -175,22 +175,22 @@ extern void sctp_free_authinfo(sctp_authinfo_t *authinfo); /* keyed-HMAC functions */ extern uint32_t sctp_get_auth_chunk_len(uint16_t hmac_algo); extern uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo); -extern uint32_t +extern uint32_t sctp_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, uint8_t *text, uint32_t textlen, uint8_t *digest); -extern int +extern int sctp_verify_hmac(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, uint8_t *text, uint32_t textlen, uint8_t *digest, uint32_t digestlen); -extern uint32_t +extern uint32_t sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t *key, uint8_t *text, uint32_t textlen, uint8_t *digest); extern int sctp_auth_is_supported_hmac(sctp_hmaclist_t *list, uint16_t id); /* mbuf versions */ -extern uint32_t +extern uint32_t sctp_hmac_m(uint16_t hmac_algo, uint8_t *key, uint32_t keylen, struct mbuf *m, uint32_t m_offset, uint8_t *digest, uint32_t trailer); -extern uint32_t +extern uint32_t sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t *key, struct mbuf *m, uint32_t m_offset, uint8_t *digest); @@ -206,26 +206,26 @@ extern int sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid); extern int sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid); extern int sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid); -extern void +extern void sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, uint32_t offset, uint32_t length); -extern void +extern void sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t key_id); extern struct mbuf * sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, struct sctp_auth_chunk **auth_ret, uint32_t *offset, struct sctp_tcb *stcb, uint8_t chunk); -extern int +extern int sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *ch, struct mbuf *m, uint32_t offset); -extern void +extern void sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, uint16_t keyid, uint16_t alt_keyid, int so_locked); -extern int +extern int sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit); -extern void +extern void sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb); diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c index 94c23bff..0f0ddd89 100644 --- a/freebsd/sys/netinet/sctp_bsd_addr.c +++ b/freebsd/sys/netinet/sctp_bsd_addr.c @@ -307,10 +307,12 @@ sctp_addr_change(struct ifaddr *ifa, int cmd) SCTP_BASE_VAR(first_time) = 1; sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID); } + if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) { /* don't know what to do with this */ return; } + if (ifa->ifa_addr == NULL) { return; } diff --git a/freebsd/sys/netinet/sctp_cc_functions.c b/freebsd/sys/netinet/sctp_cc_functions.c index e8d6a354..1163cb91 100644 --- a/freebsd/sys/netinet/sctp_cc_functions.c +++ b/freebsd/sys/netinet/sctp_cc_functions.c @@ -133,6 +133,7 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, t_ucwnd_sbw = 1; } } + /*- * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off > 0) && * (net->fast_retran_loss_recovery == 0))) @@ -1121,6 +1122,7 @@ sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets * if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } + } SCTP_STAT_INCR(sctps_ecnereducedcwnd); } else { @@ -1320,7 +1322,7 @@ sctp_cwnd_update_rtcc_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *ne static -void +void sctp_cwnd_update_rtcc_tsn_acknowledged(struct sctp_nets *net, struct sctp_tmit_chunk *tp1) { @@ -1937,6 +1939,7 @@ measure_achieved_throughput(struct sctp_nets *net) net->cc_mod.htcp_ca.lasttime = now; return; } + net->cc_mod.htcp_ca.bytecount += net->net_ack; if ((net->cc_mod.htcp_ca.bytecount >= net->cwnd - (((net->cc_mod.htcp_ca.alpha >> 7) ? (net->cc_mod.htcp_ca.alpha >> 7) : 1) * net->mtu)) && (now - net->cc_mod.htcp_ca.lasttime >= net->cc_mod.htcp_ca.minRTT) && @@ -1973,6 +1976,7 @@ htcp_beta_update(struct htcp *ca, uint32_t minRTT, uint32_t maxRTT) return; } } + if (ca->modeswitch && minRTT > (uint32_t)MSEC_TO_TICKS(10) && maxRTT) { ca->beta = (minRTT << 7) / maxRTT; if (ca->beta < BETA_MIN) @@ -1996,6 +2000,7 @@ htcp_alpha_update(struct htcp *ca) diff -= hz; factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / hz)) / hz; } + if (use_rtt_scaling && minRTT) { uint32_t scale = (hz << 3) / (10 * minRTT); @@ -2005,6 +2010,7 @@ htcp_alpha_update(struct htcp *ca) if (!factor) factor = 1; } + ca->alpha = 2 * factor * ((1 << 7) - ca->beta); if (!ca->alpha) ca->alpha = ALPHA_BASE; @@ -2059,12 +2065,14 @@ htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net) sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS); } + } else { net->cwnd += net->net_ack; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS); } + } sctp_enforce_cwnd_limit(&stcb->asoc, net); } else { diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h index 018cd282..d07381d5 100644 --- a/freebsd/sys/netinet/sctp_constants.h +++ b/freebsd/sys/netinet/sctp_constants.h @@ -470,10 +470,14 @@ __FBSDID("$FreeBSD$"); #define SCTP_STATE_IN_ACCEPT_QUEUE 0x1000 #define SCTP_STATE_MASK 0x007f -#define SCTP_GET_STATE(asoc) ((asoc)->state & SCTP_STATE_MASK) -#define SCTP_SET_STATE(asoc, newstate) ((asoc)->state = ((asoc)->state & ~SCTP_STATE_MASK) | newstate) -#define SCTP_CLEAR_SUBSTATE(asoc, substate) ((asoc)->state &= ~substate) -#define SCTP_ADD_SUBSTATE(asoc, substate) ((asoc)->state |= substate) +#define SCTP_GET_STATE(_stcb) \ + ((_stcb)->asoc.state & SCTP_STATE_MASK) +#define SCTP_SET_STATE(_stcb, _state) \ + sctp_set_state(_stcb, _state) +#define SCTP_CLEAR_SUBSTATE(_stcb, _substate) \ + (_stcb)->asoc.state &= ~(_substate) +#define SCTP_ADD_SUBSTATE(_stcb, _substate) \ + sctp_add_substate(_stcb, _substate) /* SCTP reachability state for each address */ #define SCTP_ADDR_REACHABLE 0x001 diff --git a/freebsd/sys/netinet/sctp_dtrace_define.h b/freebsd/sys/netinet/sctp_dtrace_define.h index 53451d20..ad7c8526 100644 --- a/freebsd/sys/netinet/sctp_dtrace_define.h +++ b/freebsd/sys/netinet/sctp_dtrace_define.h @@ -40,7 +40,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kernel.h> #include <sys/sdt.h> -SDT_PROVIDER_DEFINE(sctp); +SDT_PROVIDER_DECLARE(sctp); /********************************************************/ /* Cwnd probe - tracks changes in the congestion window on a netp */ diff --git a/freebsd/sys/netinet/sctp_header.h b/freebsd/sys/netinet/sctp_header.h index 685ed78a..8c4137a5 100644 --- a/freebsd/sys/netinet/sctp_header.h +++ b/freebsd/sys/netinet/sctp_header.h @@ -48,7 +48,7 @@ __FBSDID("$FreeBSD$"); * Parameter structures */ struct sctp_ipv4addr_param { - struct sctp_paramhdr ph;/* type=SCTP_IPV4_PARAM_TYPE, len=8 */ + struct sctp_paramhdr ph; /* type=SCTP_IPV4_PARAM_TYPE, len=8 */ uint32_t addr; /* IPV4 address */ } SCTP_PACKED; @@ -56,20 +56,20 @@ struct sctp_ipv4addr_param { struct sctp_ipv6addr_param { - struct sctp_paramhdr ph;/* type=SCTP_IPV6_PARAM_TYPE, len=20 */ + struct sctp_paramhdr ph; /* type=SCTP_IPV6_PARAM_TYPE, len=20 */ uint8_t addr[SCTP_V6_ADDR_BYTES]; /* IPV6 address */ } SCTP_PACKED; /* Cookie Preservative */ struct sctp_cookie_perserve_param { - struct sctp_paramhdr ph;/* type=SCTP_COOKIE_PRESERVE, len=8 */ + struct sctp_paramhdr ph; /* type=SCTP_COOKIE_PRESERVE, len=8 */ uint32_t time; /* time in ms to extend cookie */ } SCTP_PACKED; #define SCTP_ARRAY_MIN_LEN 1 /* Host Name Address */ struct sctp_host_name_param { - struct sctp_paramhdr ph;/* type=SCTP_HOSTNAME_ADDRESS */ + struct sctp_paramhdr ph; /* type=SCTP_HOSTNAME_ADDRESS */ char name[SCTP_ARRAY_MIN_LEN]; /* host name */ } SCTP_PACKED; @@ -80,7 +80,7 @@ struct sctp_host_name_param { #define SCTP_MAX_ADDR_PARAMS_SIZE 12 /* supported address type */ struct sctp_supported_addr_param { - struct sctp_paramhdr ph;/* type=SCTP_SUPPORTED_ADDRTYPE */ + struct sctp_paramhdr ph; /* type=SCTP_SUPPORTED_ADDRTYPE */ uint16_t addr_type[2]; /* array of supported address types */ } SCTP_PACKED; @@ -108,8 +108,8 @@ struct sctp_prsctp_supported_param { /* draft-ietf-tsvwg-addip-sctp */ struct sctp_asconf_paramhdr { /* an ASCONF "parameter" */ - struct sctp_paramhdr ph;/* a SCTP parameter header */ - uint32_t correlation_id;/* correlation id for this param */ + struct sctp_paramhdr ph; /* a SCTP parameter header */ + uint32_t correlation_id; /* correlation id for this param */ } SCTP_PACKED; struct sctp_asconf_addr_param { /* an ASCONF address parameter */ @@ -133,7 +133,7 @@ struct sctp_asconf_addrv4_param { /* an ASCONF address (v4) parameter */ #define SCTP_MAX_SUPPORTED_EXT 256 struct sctp_supported_chunk_types_param { - struct sctp_paramhdr ph;/* type = 0x8008 len = x */ + struct sctp_paramhdr ph; /* type = 0x8008 len = x */ uint8_t chunk_types[]; } SCTP_PACKED; @@ -206,8 +206,8 @@ struct sctp_state_cookie { /* this is our definition... */ uint16_t peerport; /* port address of the peer in the INIT */ uint16_t myport; /* my port address used in the INIT */ - uint8_t ipv4_addr_legal;/* Are V4 addr legal? */ - uint8_t ipv6_addr_legal;/* Are V6 addr legal? */ + uint8_t ipv4_addr_legal; /* Are V4 addr legal? */ + uint8_t ipv6_addr_legal; /* Are V6 addr legal? */ uint8_t local_scope; /* IPv6 local scope flag */ uint8_t site_scope; /* IPv6 site scope flag */ @@ -512,17 +512,17 @@ struct sctp_stream_reset_resp_tsn { /* Should we make the max be 32? */ #define SCTP_RANDOM_MAX_SIZE 256 struct sctp_auth_random { - struct sctp_paramhdr ph;/* type = 0x8002 */ + struct sctp_paramhdr ph; /* type = 0x8002 */ uint8_t random_data[]; } SCTP_PACKED; struct sctp_auth_chunk_list { - struct sctp_paramhdr ph;/* type = 0x8003 */ + struct sctp_paramhdr ph; /* type = 0x8003 */ uint8_t chunk_types[]; } SCTP_PACKED; struct sctp_auth_hmac_algo { - struct sctp_paramhdr ph;/* type = 0x8004 */ + struct sctp_paramhdr ph; /* type = 0x8004 */ uint16_t hmac_ids[]; } SCTP_PACKED; diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index 98b397a2..28e3f5b2 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -92,6 +92,7 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) if (stcb->sctp_socket == NULL) { return (calc); } + KASSERT(asoc->cnt_on_reasm_queue > 0 || asoc->size_on_reasm_queue == 0, ("size_on_reasm_queue is %u", asoc->size_on_reasm_queue)); KASSERT(asoc->cnt_on_all_streams > 0 || asoc->size_on_all_streams == 0, @@ -117,6 +118,7 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) /* out of space */ return (calc); } + /* what is the overhead of all these rwnd's */ calc = sctp_sbspace_sub(calc, stcb->asoc.my_rwnd_control_len); /* @@ -187,6 +189,7 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) /* user does not want any ancillary data */ return (NULL); } + len = 0; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO)) { len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); @@ -1046,6 +1049,7 @@ place_chunk: SCTP_FROM_SCTP_INDATA + SCTP_LOC_5); return; } + } if (inserted == 0) { /* Its at the end */ @@ -2140,6 +2144,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, control = NULL; goto finish_express_del; } + /* Now will we need a chunk too? */ if ((chk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) { sctp_alloc_a_chunk(stcb, chk); @@ -2570,7 +2575,7 @@ sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap) * Now we need to see if we need to queue a sack or just start the * timer (if allowed). */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) { /* * Ok special case, in SHUTDOWN-SENT case. here we maker * sure SACK timer is off and instead send a SHUTDOWN and a @@ -2927,7 +2932,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd); } /* now service all of the reassm queue if needed */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) { /* Assure that we ack right away */ stcb->asoc.send_sack = 1; } @@ -3075,7 +3080,7 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 tp1->whoTo->net_ack += tp1->send_size; if (tp1->snd_count < 2) { /*- - * True non-retransmited chunk + * True non-retransmitted chunk */ tp1->whoTo->net_ack2 += tp1->send_size; @@ -3098,6 +3103,7 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 tp1->do_rtt = 0; } } + } if (tp1->sent <= SCTP_DATAGRAM_RESEND) { if (SCTP_TSN_GT(tp1->rec.data.tsn, @@ -3363,6 +3369,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, continue; } } + } if (SCTP_TSN_GT(tp1->rec.data.tsn, asoc->this_sack_highest_gap) && !(accum_moved && asoc->fast_retran_loss_recovery)) { @@ -3598,6 +3605,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, tp1); } } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_INCREASE_PEER_RWND, asoc->peers_rwnd, tp1->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); @@ -3679,6 +3687,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, tp1->whoTo->find_pseudo_cumack = 1; tp1->whoTo->find_rtx_pseudo_cumack = 1; } + } else { /* CMT is OFF */ #ifdef SCTP_FR_TO_ALTERNATE @@ -3967,6 +3976,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, } return; } + /* First setup for CC stuff */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (SCTP_TSN_GT(cumack, net->cwr_window_tsn)) { @@ -4048,7 +4058,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, tp1->whoTo->net_ack += tp1->send_size; if (tp1->snd_count < 2) { /* - * True non-retransmited + * True non-retransmitted * chunk */ tp1->whoTo->net_ack2 += @@ -4232,6 +4242,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, asoc->total_flight = 0; asoc->total_flight_count = 0; } + /* RWND update */ asoc->peers_rwnd = sctp_sbspace_sub(rwnd, (uint32_t)(asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); @@ -4320,12 +4331,12 @@ again: /* clean up */ if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || - (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) && + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (asoc->stream_queue_cnt == 1) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; @@ -4341,12 +4352,11 @@ again: (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; @@ -4358,13 +4368,12 @@ again: stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); - } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && + } else if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) && (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; SCTP_STAT_DECR_GAUGE32(sctps_currestab); - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; @@ -4484,6 +4493,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, sctp_misc_ints(SCTP_SACK_LOG_NORMAL, cum_ack, rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd); } + old_rwnd = stcb->asoc.peers_rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, @@ -4555,6 +4565,7 @@ hopeless_peer: /* acking something behind */ return; } + /* update the Rwnd of the peer */ if (TAILQ_EMPTY(&asoc->sent_queue) && TAILQ_EMPTY(&asoc->send_queue) && @@ -4608,6 +4619,7 @@ hopeless_peer: if (stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) { (*stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) (stcb, net); } + /* * CMT: SFR algo (and HTNA) - this_sack_highest_newack has * to be greater than the cumack. Also reset saw_newack to 0 @@ -4664,7 +4676,7 @@ hopeless_peer: if (tp1->snd_count < 2) { /* - * True non-retransmited + * True non-retransmitted * chunk */ tp1->whoTo->net_ack2 += @@ -4843,6 +4855,7 @@ hopeless_peer: #endif asoc->total_flight = 0; } + /* sa_ignore NO_NULL_CHK */ if ((wake_him) && (stcb->sctp_socket)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) @@ -4947,6 +4960,7 @@ hopeless_peer: sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED); } + if (net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* @@ -4957,6 +4971,7 @@ hopeless_peer: stcb->asoc.alternate = NULL; } } + if (net->dest_state & SCTP_ADDR_PF) { net->dest_state &= ~SCTP_ADDR_PF; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, @@ -4979,6 +4994,7 @@ hopeless_peer: } asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, accum_moved, reneged_all, will_exit_fast_recovery); } + if (TAILQ_EMPTY(&asoc->sent_queue)) { /* nothing left in-flight */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { @@ -4992,6 +5008,7 @@ hopeless_peer: asoc->total_flight = 0; asoc->total_flight_count = 0; } + /**********************************/ /* Now what about shutdown issues */ /**********************************/ @@ -5009,12 +5026,12 @@ hopeless_peer: /* clean up */ if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || - (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) && + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (asoc->stream_queue_cnt == 1) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; @@ -5030,12 +5047,11 @@ hopeless_peer: (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; @@ -5048,13 +5064,12 @@ hopeless_peer: sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); return; - } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && + } else if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) && (asoc->stream_queue_cnt == 0)) { struct sctp_nets *netp; SCTP_STAT_DECR_GAUGE32(sctps_currestab); - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); if (asoc->alternate) { netp = asoc->alternate; @@ -5126,6 +5141,7 @@ hopeless_peer: if (asoc->peers_rwnd > old_rwnd) { win_probe_recovery = 1; } + /* * Now we must setup so we have a timer up for anyone with * outstanding data. diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h index 10b18d0b..59ceac3a 100644 --- a/freebsd/sys/netinet/sctp_indata.h +++ b/freebsd/sys/netinet/sctp_indata.h @@ -99,8 +99,7 @@ void sctp_handle_forward_tsn(struct sctp_tcb *, struct sctp_forward_tsn_chunk *, int *, struct mbuf *, int); -struct sctp_tmit_chunk * - sctp_try_advance_peer_ack_point(struct sctp_tcb *, struct sctp_association *); +struct sctp_tmit_chunk *sctp_try_advance_peer_ack_point(struct sctp_tcb *, struct sctp_association *); void sctp_service_queues(struct sctp_tcb *, struct sctp_association *); diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index ee206551..c7e86e78 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #if defined(INET) || defined(INET6) #include <netinet/udp.h> #endif +#include <netinet/in_kdtrace.h> #include <sys/smp.h> @@ -192,7 +193,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, goto outnow; } if ((stcb != NULL) && - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT)) { + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT)) { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending SHUTDOWN-ACK\n"); sctp_send_shutdown_ack(stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); @@ -307,6 +308,7 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION); } + } } SCTP_TCB_SEND_LOCK(stcb); @@ -493,6 +495,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, SCTP_FREE(param, SCTP_M_ASC_ADDR); } } + stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); if (op_err) { @@ -555,6 +558,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, } return (retval); } + return (0); } @@ -572,6 +576,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, /* Invalid length */ return; } + memset(&store, 0, sizeof(store)); switch (cp->heartbeat.hb_info.addr_family) { #ifdef INET @@ -711,15 +716,15 @@ sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) */ struct sctpasochead *head; - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); } - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); @@ -733,15 +738,14 @@ sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) SCTP_INP_INFO_WUNLOCK(); return (1); } - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. */ /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); - stcb->asoc.state &= ~SCTP_STATE_COOKIE_ECHOED; - stcb->asoc.state |= SCTP_STATE_COOKIE_WAIT; + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); @@ -823,8 +827,8 @@ sctp_handle_abort(struct sctp_abort_chunk *abort, sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED); /* free the tcb */ SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } #ifdef SCTP_ASOCLOG_OF_TSNS @@ -838,7 +842,7 @@ sctp_handle_abort(struct sctp_abort_chunk *abort, SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) @@ -893,15 +897,15 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, if (stcb == NULL) return; asoc = &stcb->asoc; - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { return; } if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { /* Shutdown NOT the expected size */ return; } - old_state = SCTP_GET_STATE(asoc); + old_state = SCTP_GET_STATE(stcb); sctp_update_acked(stcb, cp, abort_flag); if (*abort_flag) { return; @@ -957,11 +961,10 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, } /* goto SHUTDOWN_RECEIVED state to block new requests */ if (stcb->sctp_socket) { - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_RECEIVED); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) { + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_RECEIVED); /* * notify upper layer that peer has initiated a * shutdown @@ -972,7 +975,7 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); } } - if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) { /* * stop the shutdown timer, since we WILL move to * SHUTDOWN-ACK-SENT. @@ -992,13 +995,12 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, /* no outstanding data to send, so move on... */ /* send SHUTDOWN-ACK */ /* move to SHUTDOWN-ACK-SENT state */ - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); - if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); + if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) { + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown_ack(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, @@ -1027,15 +1029,15 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, asoc = &stcb->asoc; /* process according to association state */ - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* unexpected SHUTDOWN-ACK... do OOTB handling... */ sctp_send_shutdown_complete(stcb, net, 1); SCTP_TCB_UNLOCK(stcb); return; } - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* unexpected SHUTDOWN-ACK... so ignore... */ SCTP_TCB_UNLOCK(stcb); return; @@ -1231,7 +1233,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch, * waiting. */ if ((cause_length >= sizeof(struct sctp_error_stale_cookie)) && - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { struct sctp_error_stale_cookie *stale_cookie; stale_cookie = (struct sctp_error_stale_cookie *)cause; @@ -1264,8 +1266,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch, } /* blast back to INIT state */ sctp_toss_old_cookies(stcb, &stcb->asoc); - asoc->state &= ~SCTP_STATE_COOKIE_ECHOED; - asoc->state |= SCTP_STATE_COOKIE_WAIT; + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } @@ -1416,7 +1417,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, return (-1); } /* process according to association state... */ - switch (stcb->asoc.state & SCTP_STATE_MASK) { + switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_WAIT: /* this is the expected state for this chunk */ /* process the INIT-ACK parameters */ @@ -1442,7 +1443,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, } /* update our state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n"); - SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_ECHOED); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_ECHOED); /* reset the RTO calc */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { @@ -1536,7 +1537,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, if (how_indx < sizeof(asoc->cookie_how)) { asoc->cookie_how[how_indx] = 1; } - if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /* SHUTDOWN came in after sending INIT-ACK */ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, ""); @@ -1605,7 +1606,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, return (NULL); } - switch (SCTP_GET_STATE(asoc)) { + switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_WAIT: case SCTP_STATE_COOKIE_ECHOED: /* @@ -1629,12 +1630,12 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); /* update current state */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); @@ -1718,6 +1719,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->cookie_how[how_indx] = 5; return (stcb); } + if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag && cookie->tie_tag_my_vtag == 0 && @@ -1733,7 +1735,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, * If nat support, and the below and stcb is established, send back * a ABORT(colliding state) if we are established. */ - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) && (asoc->peer_supports_nat) && ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || @@ -1838,8 +1840,8 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, asoc->cookie_how[how_indx] = 10; return (NULL); } - if ((asoc->state & SCTP_STATE_COOKIE_WAIT) || - (asoc->state & SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || @@ -1867,17 +1869,17 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, SCTP_SOCKET_UNLOCK(so, 1); #endif } - if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); - } else if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + } else if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_COUNTER32(sctps_restartestab); } else { SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); @@ -1937,24 +1939,24 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_STAT_INCR_GAUGE32(sctps_currestab); } - if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_GAUGE32(sctps_restartestab); - } else if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { + } else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) { SCTP_STAT_INCR_GAUGE32(sctps_collisionestab); } if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); - } else if (!(asoc->state & SCTP_STATE_SHUTDOWN_SENT)) { + } else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) { /* move to OPEN state, if not in SHUTDOWN_SENT */ - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); } asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); @@ -2293,6 +2295,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, stcb->asoc.authenticated = 1; } } + /* * if we're doing ASCONFs, check to see if we have any new local * addresses that need to get added to the peer (eg. addresses @@ -2342,7 +2345,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, /* update current state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, asoc->primary_destination); @@ -2590,6 +2593,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, (uint32_t)offset, cookie_offset, sig_offset); return (NULL); } + /* * check the cookie timestamps to be sure it's not stale */ @@ -2710,6 +2714,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, } } } + cookie_len -= SCTP_SIGNATURE_SIZE; if (*stcb == NULL) { /* this is the "normal" case... get a new TCB */ @@ -2877,7 +2882,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, * the accept state waiting for the accept! */ if (*stcb) { - (*stcb)->asoc.state |= SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_ADD_SUBSTATE(*stcb, SCTP_STATE_IN_ACCEPT_QUEUE); } sctp_move_pcb_and_assoc(*inp_p, inp, *stcb); @@ -2912,6 +2917,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } + /* * Pull it from the incomplete queue and wake the * guy @@ -2952,6 +2958,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, if ((stcb == NULL) || (net == NULL)) { return; } + asoc = &stcb->asoc; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, @@ -2963,10 +2970,10 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, asoc->overall_error_count = 0; sctp_stop_all_cookie_timers(stcb); /* process according to association state */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { /* state change only needed when I am in right state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); - SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); sctp_start_net_timers(stcb); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, @@ -3018,6 +3025,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, */ goto closed_socket; } + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); @@ -3224,7 +3232,6 @@ static void sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { - struct sctp_association *asoc; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif @@ -3234,9 +3241,8 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE if (stcb == NULL) return; - asoc = &stcb->asoc; /* process according to association state */ - if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { + if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) { /* unexpected SHUTDOWN-COMPLETE... so ignore... */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n"); @@ -3248,8 +3254,8 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } #ifdef INVARIANTS - if (!TAILQ_EMPTY(&asoc->send_queue) || - !TAILQ_EMPTY(&asoc->sent_queue) || + if (!TAILQ_EMPTY(&stcb->asoc.send_queue) || + !TAILQ_EMPTY(&stcb->asoc.sent_queue) || sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); } @@ -3796,6 +3802,7 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } + stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn); memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); @@ -4395,6 +4402,7 @@ sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, if (trunc_len > limit) { trunc_len = limit; } + /* now the chunks themselves */ while ((ch != NULL) && (chlen >= sizeof(struct sctp_chunkhdr))) { desc.chunk_type = ch->chunk_type; @@ -4654,6 +4662,7 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, */ SCTP_INP_DECR_REF(inp); } + /* now go back and verify any auth chunk to be sure */ if (auth_skipped && (stcb != NULL)) { struct sctp_auth_chunk *auth; @@ -4748,11 +4757,12 @@ sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, if (((ch->chunk_type == SCTP_SELECTIVE_ACK) || (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) || (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) && - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* implied cookie-ack.. we must have lost the ack */ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } + process_control_chunks: while (IS_SCTP_CONTROL(ch)) { /* validate chunk length */ @@ -4792,6 +4802,7 @@ process_control_chunks: } return (NULL); } + num_chunks++; /* Save off the last place we got a control from */ if (stcb != NULL) { @@ -4811,7 +4822,6 @@ process_control_chunks: /* check to see if this chunk required auth, but isn't */ if ((stcb != NULL) && - (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ @@ -4941,7 +4951,7 @@ process_control_chunks: break; } } - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since @@ -5159,6 +5169,7 @@ process_control_chunks: goto abend; } } + if (netp != NULL) { struct sctp_tcb *locked_stcb; @@ -5331,6 +5342,7 @@ process_control_chunks: *offset = length; return (stcb); } + if (stcb != NULL) { int abort_flag = 0; @@ -5393,6 +5405,7 @@ process_control_chunks: *offset = length; return (stcb); } + if ((ch != NULL) && (stcb != NULL) && (netp != NULL) && (*netp != NULL)) { if (stcb->asoc.pktdrop_supported == 0) { goto unknown_chunk; @@ -5559,6 +5572,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt net->flowtype = mflowtype; net->flowid = mflowid; } + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); if ((inp != NULL) && (stcb != NULL)) { sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED); @@ -5599,6 +5613,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt net->flowid = mflowid; } if (inp == NULL) { + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_noport); if (badport_bandlim(BANDLIM_SCTP_OOTB) < 0) { goto out; @@ -5647,6 +5662,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt */ SCTP_TCB_UNLOCK(stcb); stcb = NULL; + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); @@ -5655,6 +5671,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt vrf_id, port); goto out; } + } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ @@ -5700,14 +5717,15 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt * chunks */ if ((stcb != NULL) && - (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { /* "silently" ignore */ + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_recvauthmissing); goto out; } if (stcb == NULL) { /* out of the blue DATA chunk */ + SCTP_PROBE5(receive, NULL, NULL, m, NULL, sh); snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); @@ -5718,11 +5736,13 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) { /* v_tag mismatch! */ + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_badvtag); goto out; } } + SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); if (stcb == NULL) { /* * no valid TCB for this packet, or we found it's a bad @@ -5731,6 +5751,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt */ goto out; } + /* * DATA chunk processing */ @@ -5742,7 +5763,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt */ if ((length > offset) && (stcb != NULL) && - (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ @@ -5759,7 +5779,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt * not get here unless we really did have a tag, so we don't * abort if this happens, just dump the chunk silently. */ - switch (SCTP_GET_STATE(&stcb->asoc)) { + switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_ECHOED: /* * we consider data with valid tags in this state @@ -5810,6 +5830,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt * process_data */ } + /* take care of ecn */ if ((data_processed == 1) && (stcb->asoc.ecn_supported == 1) && @@ -5817,6 +5838,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt /* Yep, we need to add a ECNE */ sctp_send_ecn_echo(stcb, net, high_tsn); } + if ((data_processed == 0) && (fwd_tsn_seen)) { int was_a_gap; uint32_t highest_tsn; diff --git a/freebsd/sys/netinet/sctp_input.h b/freebsd/sys/netinet/sctp_input.h index f393ad89..72908e11 100644 --- a/freebsd/sys/netinet/sctp_input.h +++ b/freebsd/sys/netinet/sctp_input.h @@ -52,7 +52,7 @@ struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk); -void +void sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list); diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h index d8d9e6e8..abe8e2c9 100644 --- a/freebsd/sys/netinet/sctp_os_bsd.h +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -445,7 +445,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, /* * SCTP AUTH */ -#define SCTP_READ_RANDOM(buf, len) read_random(buf, len) +#define SCTP_READ_RANDOM(buf, len) arc4rand(buf, len, 0) /* map standard crypto API names */ #define SCTP_SHA1_CTX SHA1_CTX diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index bdef958c..8f0c8aa4 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #endif #include <netinet/udp_var.h> #include <machine/in_cksum.h> +#include <netinet/in_kdtrace.h> @@ -2547,6 +2548,7 @@ once_again: inp->next_addr_touse = NULL; goto once_again; } + inp->next_addr_touse = starting_point; resettotop = 0; once_again_too: @@ -2554,6 +2556,7 @@ once_again_too: inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); resettotop = 1; } + /* ok, what about an acceptable address in the inp */ for (laddr = inp->next_addr_touse; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { @@ -2576,6 +2579,7 @@ once_again_too: inp->next_addr_touse = NULL; goto once_again_too; } + /* * no address bound can be a source for the destination we are in * trouble @@ -3990,8 +3994,8 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, int so_locked #endif ) -/* nofragment_flag to tell if IP_DF should be set (IPv4 only) */ { +/* nofragment_flag to tell if IP_DF should be set (IPv4 only) */ /** * Given a mbuf chain (via SCTP_BUF_NEXT()) that holds a packet header * WITH an SCTPHDR but no IP header, endpoint inp and sa structure: @@ -4038,6 +4042,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, if ((auth != NULL) && (stcb != NULL)) { sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid); } + if (net) { tos_value = net->dscp; } else if (stcb) { @@ -4249,6 +4254,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, SCTP_SOCKET_UNLOCK(so, 0); } #endif + SCTP_PROBE5(send, NULL, stcb, ip, stcb, sctphdr); SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { @@ -4394,7 +4400,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, } else { ip6h->ip6_nxt = IPPROTO_SCTP; } - ip6h->ip6_plen = (uint16_t)(packet_length - sizeof(struct ip6_hdr)); + ip6h->ip6_plen = htons((uint16_t)(packet_length - sizeof(struct ip6_hdr))); ip6h->ip6_dst = sin6->sin6_addr; /* @@ -4552,6 +4558,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, prev_scope = sin6->sin6_scope_id; prev_port = sin6->sin6_port; } + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ sctp_m_freem(m); @@ -4581,6 +4588,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) sctp_packet_log(o_pak); #endif + SCTP_PROBE5(send, NULL, stcb, ip6h, stcb, sctphdr); SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { @@ -4740,6 +4748,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } + /* ECN parameter */ if (stcb->asoc.ecn_supported == 1) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); @@ -4748,6 +4757,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* PR-SCTP supported parameter */ if (stcb->asoc.prsctp_supported == 1) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); @@ -4756,6 +4766,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* Add NAT friendly parameter. */ if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); @@ -4764,6 +4775,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); @@ -4854,6 +4866,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked chunk_len += parameter_len; } } + /* now any cookie time extensions */ if (stcb->asoc.cookie_preserve_req) { struct sctp_cookie_perserve_param *cookie_preserve; @@ -4871,6 +4884,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked stcb->asoc.cookie_preserve_req = 0; chunk_len += parameter_len; } + if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) { uint8_t i; @@ -4899,6 +4913,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked padding_len = 4 - 2 * i; chunk_len += parameter_len; } + SCTP_BUF_LEN(m) = chunk_len; /* now the addresses */ /* @@ -5519,7 +5534,7 @@ sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asoc = NULL; } if ((asoc != NULL) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT)) { + (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT)) { if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) { /* * new addresses, out of here in non-cookie-wait @@ -5822,9 +5837,9 @@ do_a_abort: initack->ch.chunk_length = 0; /* place in my tag */ if ((asoc != NULL) && - ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_INUSE) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED))) { + ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_INUSE) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED))) { /* re-use the v-tags and init-seq here */ initack->init.initiate_tag = htonl(asoc->my_vtag); initack->init.initial_tsn = htonl(asoc->init_seq_number); @@ -5904,6 +5919,7 @@ do_a_abort: ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } + /* ECN parameter */ if (((asoc != NULL) && (asoc->ecn_supported == 1)) || ((asoc == NULL) && (inp->ecn_supported == 1))) { @@ -5913,6 +5929,7 @@ do_a_abort: ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* PR-SCTP supported parameter */ if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || ((asoc == NULL) && (inp->prsctp_supported == 1))) { @@ -5922,6 +5939,7 @@ do_a_abort: ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* Add NAT friendly parameter */ if (nat_friendly) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); @@ -5930,6 +5948,7 @@ do_a_abort: ph->param_length = htons(parameter_len); chunk_len += parameter_len; } + /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); @@ -5973,6 +5992,7 @@ do_a_abort: padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } + /* add authentication parameters */ if (((asoc != NULL) && (asoc->auth_supported == 1)) || ((asoc == NULL) && (inp->auth_supported == 1))) { @@ -6050,6 +6070,7 @@ do_a_abort: SCTP_BUF_LEN(m) += padding_len; padding_len = 0; } + /* tack on the operational error if present */ if (op_err) { parameter_len = 0; @@ -6347,9 +6368,9 @@ sctp_msg_append(struct sctp_tcb *stcb, } strm = &stcb->asoc.strmout[srcv->sinfo_stream]; /* Now can we send this? */ - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_SENT) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); @@ -6682,18 +6703,17 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, * there is nothing queued to send, so I'm * done... */ - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* * only send SHUTDOWN the first time * through */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, @@ -6714,13 +6734,13 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, * we will allow user data to be sent first * and move to SHUTDOWN-PENDING */ - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } - asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { @@ -7433,6 +7453,7 @@ dont_do_it: chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf); } } + if (to_move > length) { /*- This should not happen either * since we always lower to_move to the size @@ -7839,7 +7860,7 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, *reason_code = 0; auth_keyid = stcb->asoc.authinfo.active_keyid; if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || - (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { eeor_mode = 1; } else { @@ -7970,6 +7991,7 @@ nothing_to_send: *reason_code = 8; return (0); } + if (asoc->sctp_cmt_on_off > 0) { /* get the last start point */ start_at = asoc->last_net_cmt_send_started; @@ -8596,8 +8618,8 @@ again_one_more_time: omtu = 0; break; } - if ((((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) && + if ((((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (skip_data_for_this_net == 0)) || (cookie)) { TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { @@ -8616,6 +8638,7 @@ again_one_more_time: /* Don't send the chunk on this net */ continue; } + if (asoc->sctp_cmt_on_off == 0) { if ((asoc->alternate) && (asoc->alternate != net) && @@ -8645,7 +8668,7 @@ again_one_more_time: chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && - ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) == SCTP_STATE_SHUTDOWN_PENDING)) { + (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { struct sctp_data_chunk *dchkh; dchkh = mtod(chk->data, struct sctp_data_chunk *); @@ -8879,6 +8902,7 @@ no_data_fill: if (old_start_at) goto again_one_more_time; } + /* * At the end there should be no NON timed chunks hanging on this * queue. @@ -9272,17 +9296,20 @@ sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked) /* can't send a new one if there is one in flight already */ return; } + /* compose an ASCONF chunk, maximum length is PMTU */ m_asconf = sctp_compose_asconf(stcb, &len, addr_locked); if (m_asconf == NULL) { return; } + sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_asconf); return; } + chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ASCONF; chk->rec.chunk_id.can_take_data = 0; @@ -9353,6 +9380,7 @@ sctp_send_asconf_ack(struct sctp_tcb *stcb) if (ack->data == NULL) { continue; } + /* copy the asconf_ack */ m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT); if (m_ack == NULL) { @@ -9541,8 +9569,8 @@ sctp_chunk_retransmission(struct sctp_inpcb *inp, if (TAILQ_EMPTY(&asoc->sent_queue)) { return (SCTP_RETRAN_DONE); } - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT)) { /* not yet open, resend the cookie and that is it */ return (1); } @@ -10241,6 +10269,7 @@ sctp_output( SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } + if (inp->sctp_socket == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); @@ -11228,12 +11257,13 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, sctp_packet_log(o_pak); } #endif + SCTP_PROBE5(send, NULL, NULL, ip, NULL, shout); SCTP_IP_OUTPUT(ret, o_pak, NULL, NULL, vrf_id); break; #endif #ifdef INET6 case AF_INET6: - ip6->ip6_plen = (uint16_t)(len - sizeof(struct ip6_hdr)); + ip6->ip6_plen = htons((uint16_t)(len - sizeof(struct ip6_hdr))); if (port) { shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); @@ -11250,6 +11280,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, sctp_packet_log(o_pak); } #endif + SCTP_PROBE5(send, NULL, NULL, ip6, NULL, shout); SCTP_IP6_OUTPUT(ret, o_pak, NULL, NULL, NULL, vrf_id); break; #endif @@ -11314,6 +11345,7 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n"); return; } + chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST; chk->rec.chunk_id.can_take_data = 1; @@ -12332,6 +12364,7 @@ sctp_copy_one(struct sctp_stream_queue_pending *sp, SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS); return (ENOBUFS); } + sp->tail_mbuf = m_last(sp->data); return (0); } @@ -12348,6 +12381,7 @@ sctp_copy_it_in(struct sctp_tcb *stcb, int user_marks_eor, int *error) { + /*- * This routine must be very careful in its work. Protocol * processing is up and running so care must be taken to spl...() @@ -12360,9 +12394,9 @@ sctp_copy_it_in(struct sctp_tcb *stcb, *error = 0; /* Now can we send this? */ - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); @@ -12748,7 +12782,7 @@ sctp_lower_sosend(struct socket *so, */ queue_only = 1; asoc = &stcb->asoc; - SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); /* initialize authentication params for the assoc */ @@ -12870,8 +12904,8 @@ sctp_lower_sosend(struct socket *so, SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error); goto out_unlocked; } - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { queue_only = 1; } /* we are now done with all control */ @@ -12879,9 +12913,9 @@ sctp_lower_sosend(struct socket *so, sctp_m_freem(control); control = NULL; } - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { if (srcv->sinfo_flags & SCTP_ABORT) { ; @@ -12903,8 +12937,8 @@ sctp_lower_sosend(struct socket *so, int tot_demand, tot_out = 0, max_out; SCTP_STAT_INCR(sctps_sends_with_abort); - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* It has to be up before we abort */ /* how big is the user initiated abort? */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); @@ -13014,6 +13048,7 @@ sctp_lower_sosend(struct socket *so, error = EFAULT; goto out_unlocked; } + /* Unless E_EOR mode is on, we must make a send FIT in one call. */ if ((user_marks_eor == 0) && (sndlen > SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { @@ -13031,6 +13066,7 @@ sctp_lower_sosend(struct socket *so, error = EINVAL; goto out_unlocked; } + if (user_marks_eor) { local_add_more = min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); } else { @@ -13095,6 +13131,7 @@ sctp_lower_sosend(struct socket *so, } SOCKBUF_UNLOCK(&so->so_snd); } + skip_preblock: if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto out_unlocked; @@ -13231,6 +13268,7 @@ skip_preblock: if (srcv->sinfo_flags & SCTP_SACK_IMMEDIATELY) { sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY; } + /* Did we reach EOR? */ if ((uio->uio_resid == 0) && ((user_marks_eor == 0) || @@ -13279,12 +13317,12 @@ skip_preblock: SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); - SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } @@ -13396,6 +13434,7 @@ skip_preblock: SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, asoc, stcb->asoc.total_output_queue_size); @@ -13466,17 +13505,16 @@ dataless_eof: goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* only send SHUTDOWN the first time through */ - if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; @@ -13500,17 +13538,17 @@ dataless_eof: * data to be sent first and move to * SHUTDOWN-PENDING */ - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } - asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { @@ -13551,12 +13589,12 @@ skip_out_eof: SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); - SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } @@ -13761,6 +13799,7 @@ sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t *ro) SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); return (0); } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h index e6222e3f..1b3d22d9 100644 --- a/freebsd/sys/netinet/sctp_output.h +++ b/freebsd/sys/netinet/sctp_output.h @@ -74,7 +74,7 @@ int int sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t *ro); -void +void sctp_send_initiate(struct sctp_inpcb *, struct sctp_tcb *, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED @@ -117,7 +117,7 @@ void sctp_send_shutdown_ack(struct sctp_tcb *, struct sctp_nets *); void sctp_send_shutdown_complete(struct sctp_tcb *, struct sctp_nets *, int); -void +void sctp_send_shutdown_complete2(struct sockaddr *, struct sockaddr *, struct sctphdr *, uint8_t, uint32_t, uint16_t, @@ -146,13 +146,13 @@ int sctp_output(struct sctp_inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *, int); -void +void sctp_chunk_output(struct sctp_inpcb *, struct sctp_tcb *, int, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED #endif ); -void +void sctp_send_abort_tcb(struct sctp_tcb *, struct mbuf *, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED @@ -201,7 +201,7 @@ sctp_send_abort(struct mbuf *, int, struct sockaddr *, struct sockaddr *, uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); -void +void sctp_send_operr_to(struct sockaddr *, struct sockaddr *, struct sctphdr *, uint32_t, struct mbuf *, uint8_t, uint32_t, uint16_t, diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c index cf993d64..782e5f1d 100644 --- a/freebsd/sys/netinet/sctp_pcb.c +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -187,6 +187,7 @@ sctp_allocate_vrf(int vrf_id) SCTP_FREE(vrf, SCTP_M_VRF); return (NULL); } + /* Add it to the hash table */ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_INSERT_HEAD(bucket, vrf, next_vrf); @@ -738,6 +739,7 @@ sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out_now; } + #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); @@ -866,6 +868,7 @@ sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) SCTP_IPI_ADDR_RUNLOCK(); return (0); } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && @@ -1027,6 +1030,7 @@ sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, if ((to == NULL) || (from == NULL)) { return (NULL); } + switch (to->sa_family) { #ifdef INET case AF_INET: @@ -1389,6 +1393,7 @@ sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } + SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); @@ -2254,6 +2259,7 @@ sctp_findassociation_addr(struct mbuf *m, int offset, return (stcb); } } + if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, 1, vrf_id); @@ -2849,6 +2855,7 @@ sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } + sin = (struct sockaddr_in *)addr; lport = sin->sin_port; /* @@ -3368,14 +3375,14 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) * was not closed. So go ahead and * start it now. */ - asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } SCTP_TCB_UNLOCK(asoc); continue; } - if (((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_ECHOED)) && + if (((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) && (asoc->asoc.total_output_queue_size == 0)) { /* * If we have data in queue, we don't want @@ -3392,7 +3399,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) } /* Disconnect the socket please */ asoc->sctp_socket = NULL; - asoc->asoc.state |= SCTP_STATE_CLOSED_SOCKET; + SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_CLOSED_SOCKET); if ((asoc->asoc.size_on_reasm_queue > 0) || (asoc->asoc.control_pdapi) || (asoc->asoc.size_on_all_streams > 0) || @@ -3404,8 +3411,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, @@ -3419,20 +3426,19 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { goto abort_anyway; } - if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* * there is nothing queued to send, * so I send shutdown */ - if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(asoc); if (asoc->asoc.alternate) { netp = asoc->asoc.alternate; @@ -3448,11 +3454,11 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) } } else { /* mark into shutdown pending */ - asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING; + SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, asoc->asoc.primary_destination); if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { - asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && @@ -3464,8 +3470,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, @@ -3503,6 +3509,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) LIST_REMOVE(inp, sctp_hash); inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; } + /* * If there is a timer running to kill us, forget it, since it may * have a contest on the INP lock.. which would cause us to die ... @@ -3512,7 +3519,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) SCTP_TCB_LOCK(asoc); if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { - asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } cnt++; @@ -3520,7 +3527,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) continue; } /* Free associations that are NOT killing us */ - if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_COOKIE_WAIT) && + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { struct mbuf *op_err; @@ -3533,8 +3540,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) SCTP_TCB_UNLOCK(asoc); continue; } - if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, @@ -3637,6 +3644,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; } + + #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { struct in6pcb *in6p; @@ -4797,7 +4806,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre } /* Now the read queue needs to be cleaned up (only once) */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) { - stcb->asoc.state |= SCTP_STATE_ABOUT_TO_BE_FREED; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_ABOUT_TO_BE_FREED); SCTP_INP_READ_LOCK(inp); TAILQ_FOREACH(sq, &inp->read_queue, next) { if (sq->stcb == stcb) { @@ -4851,7 +4860,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre if ((stcb->asoc.refcnt) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { - stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } SCTP_TCB_UNLOCK(stcb); @@ -4864,6 +4873,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre sctp_sorwakeup(inp, so); sctp_sowwakeup(inp, so); } + #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 9); #endif @@ -4922,6 +4932,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre } } } + /* * Make it invalid too, that way if its about to run it will abort * and return. @@ -4931,7 +4942,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre atomic_add_int(&stcb->asoc.refcnt, -1); } if (stcb->asoc.refcnt) { - stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); @@ -5339,6 +5350,7 @@ sctp_update_ep_vflag(struct sctp_inpcb *inp) __func__); continue; } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } @@ -5752,6 +5764,7 @@ sctp_startup_mcore_threads(void) i++; } } + /* Now start them all */ CPU_FOREACH(cpu) { (void)kproc_create(sctp_mcore_thread, @@ -6267,7 +6280,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, * assoc? straighten out locks. */ if (stcb_tmp) { - if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; @@ -6286,6 +6299,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, } SCTP_TCB_UNLOCK(stcb_tmp); } + if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-12); @@ -6366,7 +6380,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, * assoc? straighten out locks. */ if (stcb_tmp) { - if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; @@ -6708,6 +6722,8 @@ next_param: if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); + } else { + keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { @@ -7003,6 +7019,7 @@ sctp_drain_mbufs(struct sctp_tcb *stcb) if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } + /* * Question, should we go through the delivery queue? The * only reason things are on here is the app not reading OR diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h index 3fc03399..5b41ae8a 100644 --- a/freebsd/sys/netinet/sctp_pcb.h +++ b/freebsd/sys/netinet/sctp_pcb.h @@ -363,7 +363,7 @@ struct sctp_inpcb { union { struct inpcb inp; char align[(sizeof(struct in6pcb) + SCTP_ALIGNM1) & - ~SCTP_ALIGNM1]; + ~SCTP_ALIGNM1]; } ip_inp; @@ -389,7 +389,7 @@ struct sctp_inpcb { uint64_t sctp_features; /* Feature flags */ uint32_t sctp_flags; /* INP state flag set */ uint32_t sctp_mobility_features; /* Mobility Feature flags */ - struct sctp_pcb sctp_ep;/* SCTP ep data */ + struct sctp_pcb sctp_ep; /* SCTP ep data */ /* head of the hash of all associations */ struct sctpasochead *sctp_tcbhash; u_long sctp_hashmark; @@ -492,8 +492,7 @@ int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b); void sctp_fill_pcbinfo(struct sctp_pcbinfo *); -struct sctp_ifn * - sctp_find_ifn(void *ifn, uint32_t ifn_index); +struct sctp_ifn *sctp_find_ifn(void *ifn, uint32_t ifn_index); struct sctp_vrf *sctp_allocate_vrf(int vrfid); struct sctp_vrf *sctp_find_vrf(uint32_t vrfid); @@ -524,7 +523,7 @@ void sctp_free_ifn(struct sctp_ifn *sctp_ifnp); void sctp_free_ifa(struct sctp_ifa *sctp_ifap); -void +void sctp_del_addr_from_vrf(uint32_t vrfid, struct sockaddr *addr, uint32_t ifn_index, const char *if_name); @@ -534,7 +533,7 @@ struct sctp_nets *sctp_findnet(struct sctp_tcb *, struct sockaddr *); struct sctp_inpcb *sctp_pcb_findep(struct sockaddr *, int, int, uint32_t); -int +int sctp_inpcb_bind(struct socket *, struct sockaddr *, struct sctp_ifa *, struct thread *); @@ -563,8 +562,7 @@ sctp_findassociation_ep_addr(struct sctp_inpcb **, struct sockaddr *, struct sctp_nets **, struct sockaddr *, struct sctp_tcb *); -struct sctp_tcb * - sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock); +struct sctp_tcb *sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock); struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *, diff --git a/freebsd/sys/netinet/sctp_peeloff.c b/freebsd/sys/netinet/sctp_peeloff.c index ad96b88c..14a7c381 100644 --- a/freebsd/sys/netinet/sctp_peeloff.c +++ b/freebsd/sys/netinet/sctp_peeloff.c @@ -76,7 +76,7 @@ sctp_can_peel_off(struct socket *head, sctp_assoc_t assoc_id) SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOENT); return (ENOENT); } - state = SCTP_GET_STATE((&stcb->asoc)); + state = SCTP_GET_STATE(stcb); if ((state == SCTP_STATE_EMPTY) || (state == SCTP_STATE_INUSE)) { SCTP_TCB_UNLOCK(stcb); @@ -105,13 +105,15 @@ sctp_do_peeloff(struct socket *head, struct socket *so, sctp_assoc_t assoc_id) SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); return (ENOTCONN); } - state = SCTP_GET_STATE((&stcb->asoc)); + + state = SCTP_GET_STATE(stcb); if ((state == SCTP_STATE_EMPTY) || (state == SCTP_STATE_INUSE)) { SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); return (ENOTCONN); } + n_inp = (struct sctp_inpcb *)so->so_pcb; n_inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | SCTP_PCB_FLAGS_CONNECTED | diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h index d60705b4..c4eafc26 100644 --- a/freebsd/sys/netinet/sctp_structs.h +++ b/freebsd/sys/netinet/sctp_structs.h @@ -105,7 +105,7 @@ TAILQ_HEAD(sctp_resethead, sctp_stream_reset_list); #define SCTP_ASOC_ANY_STATE 0x00000000 typedef void (*asoc_func) (struct sctp_inpcb *, struct sctp_tcb *, void *ptr, - uint32_t val); + uint32_t val); typedef int (*inp_func) (struct sctp_inpcb *, void *ptr, uint32_t val); typedef void (*end_func) (void *ptr, uint32_t val); @@ -144,7 +144,7 @@ struct sctp_iterator { asoc_func function_assoc; /* per assoc function */ inp_func function_inp; /* per endpoint function */ inp_func function_inp_end; /* end INP function */ - end_func function_atend;/* iterator completion function */ + end_func function_atend; /* iterator completion function */ void *pointer; /* pointer for apply func to use */ uint32_t val; /* value for apply func to use */ uint32_t pcb_flags; /* endpoint flags being checked */ @@ -231,7 +231,7 @@ struct rtcc_cc { uint64_t bw_tot_time; /* The total time since sending began */ uint64_t new_tot_time; /* temp holding the new value */ uint64_t bw_bytes_at_last_rttc; /* What bw_bytes was at last rtt calc */ - uint32_t cwnd_at_bw_set;/* Cwnd at last bw saved - lbw */ + uint32_t cwnd_at_bw_set; /* Cwnd at last bw saved - lbw */ uint32_t vol_reduce; /* cnt of voluntary reductions */ uint16_t steady_step; /* The number required to be in steady state */ uint16_t step_cnt; /* The current number */ @@ -240,7 +240,8 @@ struct rtcc_cc { uint8_t use_dccc_ecn; /* Flag to enable DCCC ECN */ uint8_t tls_needs_set; /* Flag to indicate we need to set tls 0 or 1 * means set at send 2 not */ - uint8_t last_step_state;/* Last state if steady state stepdown is on */ + uint8_t last_step_state; /* Last state if steady state stepdown + * is on */ uint8_t rtt_set_this_sack; /* Flag saying this sack had RTT calc * on it */ uint8_t last_inst_ind; /* Last saved inst indication */ @@ -331,8 +332,8 @@ struct sctp_nets { uint8_t dscp; struct timeval start_time; /* time when this net was created */ - uint32_t marked_retrans;/* number or DATA chunks marked for timer - * based retransmissions */ + uint32_t marked_retrans; /* number or DATA chunks marked for + * timer based retransmissions */ uint32_t marked_fastretrans; uint32_t heart_beat_delay; /* Heart Beat delay in ms */ @@ -706,28 +707,28 @@ struct sctp_nonpad_sndrcvinfo { struct sctp_cc_functions { void (*sctp_set_initial_cc_param) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_after_sack) (struct sctp_tcb *stcb, - struct sctp_association *asoc, - int accum_moved, int reneged_all, int will_exit); + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit); void (*sctp_cwnd_update_exit_pf) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_after_fr) (struct sctp_tcb *stcb, - struct sctp_association *asoc); + struct sctp_association *asoc); void (*sctp_cwnd_update_after_timeout) (struct sctp_tcb *stcb, - struct sctp_nets *net); + struct sctp_nets *net); void (*sctp_cwnd_update_after_ecn_echo) (struct sctp_tcb *stcb, - struct sctp_nets *net, int in_window, int num_pkt_lost); + struct sctp_nets *net, int in_window, int num_pkt_lost); void (*sctp_cwnd_update_after_packet_dropped) (struct sctp_tcb *stcb, - struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, - uint32_t *bottle_bw, uint32_t *on_queue); + struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, + uint32_t *bottle_bw, uint32_t *on_queue); void (*sctp_cwnd_update_after_output) (struct sctp_tcb *stcb, - struct sctp_nets *net, int burst_limit); + struct sctp_nets *net, int burst_limit); void (*sctp_cwnd_update_packet_transmitted) (struct sctp_tcb *stcb, - struct sctp_nets *net); + struct sctp_nets *net); void (*sctp_cwnd_update_tsn_acknowledged) (struct sctp_nets *net, - struct sctp_tmit_chunk *); + struct sctp_tmit_chunk *); void (*sctp_cwnd_new_transmission_begins) (struct sctp_tcb *stcb, - struct sctp_nets *net); + struct sctp_nets *net); void (*sctp_cwnd_prepare_net_for_sack) (struct sctp_tcb *stcb, - struct sctp_nets *net); + struct sctp_nets *net); int (*sctp_cwnd_socket_option) (struct sctp_tcb *stcb, int set, struct sctp_cc_option *); void (*sctp_rtt_calculated) (struct sctp_tcb *, struct sctp_nets *, struct timeval *); }; @@ -738,25 +739,25 @@ struct sctp_cc_functions { */ struct sctp_ss_functions { void (*sctp_ss_init) (struct sctp_tcb *stcb, struct sctp_association *asoc, - int holds_lock); + int holds_lock); void (*sctp_ss_clear) (struct sctp_tcb *stcb, struct sctp_association *asoc, - int clear_values, int holds_lock); + int clear_values, int holds_lock); void (*sctp_ss_init_stream) (struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq); void (*sctp_ss_add_to_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); + struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); int (*sctp_ss_is_empty) (struct sctp_tcb *stcb, struct sctp_association *asoc); void (*sctp_ss_remove_from_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); - struct sctp_stream_out *(*sctp_ss_select_stream) (struct sctp_tcb *stcb, - struct sctp_nets *net, struct sctp_association *asoc); + struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); +struct sctp_stream_out *(*sctp_ss_select_stream) (struct sctp_tcb *stcb, + struct sctp_nets *net, struct sctp_association *asoc); void (*sctp_ss_scheduled) (struct sctp_tcb *stcb, struct sctp_nets *net, - struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much); + struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much); void (*sctp_ss_packet_done) (struct sctp_tcb *stcb, struct sctp_nets *net, - struct sctp_association *asoc); + struct sctp_association *asoc); int (*sctp_ss_get_value) (struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_stream_out *strq, uint16_t *value); + struct sctp_stream_out *strq, uint16_t *value); int (*sctp_ss_set_value) (struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_stream_out *strq, uint16_t value); + struct sctp_stream_out *strq, uint16_t value); int (*sctp_ss_is_user_msgs_incomplete) (struct sctp_tcb *stcb, struct sctp_association *asoc); }; diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c index f1a8d1d5..a4343cbe 100644 --- a/freebsd/sys/netinet/sctp_sysctl.c +++ b/freebsd/sys/netinet/sctp_sysctl.c @@ -411,7 +411,7 @@ sctp_sysctl_handle_assoclist(SYSCTL_HANDLER_ARGS) xinpcb.total_recvs = inp->total_recvs; xinpcb.total_nospaces = inp->total_nospaces; xinpcb.fragmentation_point = inp->sctp_frag_point; - xinpcb.socket = inp->sctp_socket; + xinpcb.socket = (uintptr_t)inp->sctp_socket; so = inp->sctp_socket; if ((so == NULL) || (!SCTP_IS_LISTENING(inp)) || diff --git a/freebsd/sys/netinet/sctp_timer.c b/freebsd/sys/netinet/sctp_timer.c index c0253840..86ed4d0d 100644 --- a/freebsd/sys/netinet/sctp_timer.c +++ b/freebsd/sys/netinet/sctp_timer.c @@ -651,6 +651,7 @@ start_again: sctp_log_fr(chk->rec.data.tsn, chk->snd_count, 0, SCTP_FR_T3_MARKED); } + if (chk->rec.data.chunk_was_revoked) { /* deflate the cwnd */ chk->whoTo->cwnd -= chk->book_size; @@ -717,6 +718,7 @@ start_again: /* we did not subtract the same things? */ audit_tf = 1; } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { sctp_log_fr(tsnfirst, tsnlast, num_mk, SCTP_FR_T3_TIMEOUT); } @@ -791,6 +793,7 @@ start_again: (uint32_t)(uintptr_t)chk->whoTo, chk->rec.data.tsn); } + sctp_flight_size_increase(chk); sctp_total_flight_increase(stcb, chk); } @@ -911,6 +914,7 @@ sctp_t3rxt_timer(struct sctp_inpcb *inp, (net->flight_size == 0)) { (*stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) (stcb, net); } + /* * setup the sat loss recovery that prevents satellite cwnd advance. */ @@ -939,6 +943,7 @@ sctp_t3rxt_timer(struct sctp_inpcb *inp, RTFREE(net->ro.ro_rt); net->ro.ro_rt = NULL; } + /* Was it our primary? */ if ((stcb->asoc.primary_destination == net) && (alt != net)) { /* @@ -959,7 +964,7 @@ sctp_t3rxt_timer(struct sctp_inpcb *inp, * Special case for cookie-echo'ed case, we don't do output but must * await the COOKIE-ACK before retransmission */ - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { /* * Here we just reset the timer and start again since we * have not established the asoc @@ -1001,7 +1006,7 @@ sctp_t1init_timer(struct sctp_inpcb *inp, sctp_send_initiate(inp, stcb, SCTP_SO_NOT_LOCKED); return (0); } - if (SCTP_GET_STATE((&stcb->asoc)) != SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) { return (0); } if (sctp_threshold_management(inp, stcb, net, @@ -1049,7 +1054,7 @@ sctp_cookie_timer(struct sctp_inpcb *inp, } } if (cookie == NULL) { - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { /* FOOBAR! */ struct mbuf *op_err; @@ -1061,7 +1066,7 @@ sctp_cookie_timer(struct sctp_inpcb *inp, #ifdef INVARIANTS panic("Cookie timer expires in wrong state?"); #else - SCTP_PRINTF("Strange in state %d not cookie-echoed yet c-e timer expires?\n", SCTP_GET_STATE(&stcb->asoc)); + SCTP_PRINTF("Strange in state %d not cookie-echoed yet c-e timer expires?\n", SCTP_GET_STATE(stcb)); return (0); #endif } @@ -1212,6 +1217,7 @@ sctp_asconf_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asconf->whoTo = alt; atomic_add_int(&alt->ref_count, 1); } + /* See if an ECN Echo is also stranded */ TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { if ((chk->whoTo == net) && @@ -1554,16 +1560,15 @@ sctp_autoclose_timer(struct sctp_inpcb *inp, * there is nothing queued to send, so I'm * done... */ - if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { + if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) { /* only send SHUTDOWN 1st time thru */ struct sctp_nets *netp; - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h index 8732219c..c91e0414 100644 --- a/freebsd/sys/netinet/sctp_uio.h +++ b/freebsd/sys/netinet/sctp_uio.h @@ -258,13 +258,14 @@ struct sctp_snd_all_completes { /* for the endpoint */ /* The lower four bits is an enumeration of PR-SCTP policies */ -#define SCTP_PR_SCTP_NONE 0x0000/* Reliable transfer */ -#define SCTP_PR_SCTP_TTL 0x0001/* Time based PR-SCTP */ -#define SCTP_PR_SCTP_PRIO 0x0002/* Buffer based PR-SCTP */ +#define SCTP_PR_SCTP_NONE 0x0000 /* Reliable transfer */ +#define SCTP_PR_SCTP_TTL 0x0001 /* Time based PR-SCTP */ +#define SCTP_PR_SCTP_PRIO 0x0002 /* Buffer based PR-SCTP */ #define SCTP_PR_SCTP_BUF SCTP_PR_SCTP_PRIO /* For backwards compatibility */ -#define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */ +#define SCTP_PR_SCTP_RTX 0x0003 /* Number of retransmissions based + * PR-SCTP */ #define SCTP_PR_SCTP_MAX SCTP_PR_SCTP_RTX -#define SCTP_PR_SCTP_ALL 0x000f/* Used for aggregated stats */ +#define SCTP_PR_SCTP_ALL 0x000f /* Used for aggregated stats */ #define PR_SCTP_POLICY(x) ((x) & 0x0f) #define PR_SCTP_ENABLED(x) ((PR_SCTP_POLICY(x) != SCTP_PR_SCTP_NONE) && \ @@ -744,7 +745,7 @@ struct sctp_prstatus { struct sctp_cwnd_args { struct sctp_nets *net; /* network to *//* FIXME: LP64 issue */ - uint32_t cwnd_new_value;/* cwnd in k */ + uint32_t cwnd_new_value; /* cwnd in k */ uint32_t pseudo_cumack; uint16_t inflight; /* flightsize in k */ uint16_t cwnd_augment; /* increment to it */ @@ -758,9 +759,9 @@ struct sctp_blk_args { uint32_t onsb; /* in 1k bytes */ uint32_t sndlen; /* len of send being attempted */ uint32_t peer_rwnd; /* rwnd of peer */ - uint16_t send_sent_qcnt;/* chnk cnt */ + uint16_t send_sent_qcnt; /* chnk cnt */ uint16_t stream_qcnt; /* chnk cnt */ - uint16_t chunks_on_oque;/* chunks out */ + uint16_t chunks_on_oque; /* chunks out */ uint16_t flight_size; /* flight size in k */ }; @@ -952,7 +953,7 @@ struct sctpstat { uint32_t sctps_collisionestab; uint32_t sctps_passiveestab; /* sctpStats 3 (Counter32) */ uint32_t sctps_aborted; /* sctpStats 4 (Counter32) */ - uint32_t sctps_shutdown;/* sctpStats 5 (Counter32) */ + uint32_t sctps_shutdown; /* sctpStats 5 (Counter32) */ uint32_t sctps_outoftheblue; /* sctpStats 6 (Counter32) */ uint32_t sctps_checksumerrors; /* sctpStats 7 (Counter32) */ uint32_t sctps_outcontrolchunks; /* sctpStats 8 (Counter64) */ @@ -971,12 +972,12 @@ struct sctpstat { uint32_t sctps_recvdatagrams; /* total input datagrams */ uint32_t sctps_recvpktwithdata; /* total packets that had data */ uint32_t sctps_recvsacks; /* total input SACK chunks */ - uint32_t sctps_recvdata;/* total input DATA chunks */ + uint32_t sctps_recvdata; /* total input DATA chunks */ uint32_t sctps_recvdupdata; /* total input duplicate DATA chunks */ uint32_t sctps_recvheartbeat; /* total input HB chunks */ uint32_t sctps_recvheartbeatack; /* total input HB-ACK chunks */ - uint32_t sctps_recvecne;/* total input ECNE chunks */ - uint32_t sctps_recvauth;/* total input AUTH chunks */ + uint32_t sctps_recvecne; /* total input ECNE chunks */ + uint32_t sctps_recvauth; /* total input AUTH chunks */ uint32_t sctps_recvauthmissing; /* total input chunks missing AUTH */ uint32_t sctps_recvivalhmacid; /* total number of invalid HMAC ids * received */ @@ -993,7 +994,7 @@ struct sctpstat { /* output statistics: */ uint32_t sctps_sendpackets; /* total output packets */ uint32_t sctps_sendsacks; /* total output SACKs */ - uint32_t sctps_senddata;/* total output DATA chunks */ + uint32_t sctps_senddata; /* total output DATA chunks */ uint32_t sctps_sendretransdata; /* total output retransmitted DATA * chunks */ uint32_t sctps_sendfastretrans; /* total output fast retransmitted @@ -1003,8 +1004,8 @@ struct sctpstat { * chunk (u-del multi-fr * algo). */ uint32_t sctps_sendheartbeat; /* total output HB chunks */ - uint32_t sctps_sendecne;/* total output ECNE chunks */ - uint32_t sctps_sendauth;/* total output AUTH chunks FIXME */ + uint32_t sctps_sendecne; /* total output ECNE chunks */ + uint32_t sctps_sendauth; /* total output AUTH chunks FIXME */ uint32_t sctps_senderrors; /* ip_output error counter */ uint32_t sctps_send_spare; /* formerly sctps_sendnocrc */ uint32_t sctps_sendswcrc; @@ -1012,8 +1013,8 @@ struct sctpstat { /* PCKDROPREP statistics: */ uint32_t sctps_pdrpfmbox; /* Packet drop from middle box */ uint32_t sctps_pdrpfehos; /* P-drop from end host */ - uint32_t sctps_pdrpmbda;/* P-drops with data */ - uint32_t sctps_pdrpmbct;/* P-drops, non-data, non-endhost */ + uint32_t sctps_pdrpmbda; /* P-drops with data */ + uint32_t sctps_pdrpmbct; /* P-drops, non-data, non-endhost */ uint32_t sctps_pdrpbwrpt; /* P-drop, non-endhost, bandwidth rep * only */ uint32_t sctps_pdrpcrupt; /* P-drop, not enough for chunk header */ @@ -1024,16 +1025,17 @@ struct sctpstat { uint32_t sctps_pdrpdnfnd; /* P-drop, attempt reverse TSN lookup */ uint32_t sctps_pdrpdiwnp; /* P-drop, e-host confirms zero-rwnd */ uint32_t sctps_pdrpdizrw; /* P-drop, midbox confirms no space */ - uint32_t sctps_pdrpbadd;/* P-drop, data did not match TSN */ - uint32_t sctps_pdrpmark;/* P-drop, TSN's marked for Fast Retran */ + uint32_t sctps_pdrpbadd; /* P-drop, data did not match TSN */ + uint32_t sctps_pdrpmark; /* P-drop, TSN's marked for Fast + * Retran */ /* timeouts */ uint32_t sctps_timoiterator; /* Number of iterator timers that * fired */ - uint32_t sctps_timodata;/* Number of T3 data time outs */ + uint32_t sctps_timodata; /* Number of T3 data time outs */ uint32_t sctps_timowindowprobe; /* Number of window probe (T3) timers * that fired */ - uint32_t sctps_timoinit;/* Number of INIT timers that fired */ - uint32_t sctps_timosack;/* Number of sack timers that fired */ + uint32_t sctps_timoinit; /* Number of INIT timers that fired */ + uint32_t sctps_timosack; /* Number of sack timers that fired */ uint32_t sctps_timoshutdown; /* Number of shutdown timers that * fired */ uint32_t sctps_timoheartbeat; /* Number of heartbeat timers that @@ -1175,14 +1177,11 @@ struct xsctp_inpcb { uint16_t local_port; uint16_t qlen_old; uint16_t maxqlen_old; - void *socket; + uint16_t __spare16; + kvaddr_t socket; uint32_t qlen; uint32_t maxqlen; -#if defined(__LP64__) - uint32_t extra_padding[27]; /* future */ -#else - uint32_t extra_padding[28]; /* future */ -#endif + uint32_t extra_padding[26]; /* future */ }; struct xsctp_tcb { @@ -1192,7 +1191,7 @@ struct xsctp_tcb { uint32_t state; /* sctpAssocEntry 8 */ uint32_t in_streams; /* sctpAssocEntry 9 */ uint32_t out_streams; /* sctpAssocEntry 10 */ - uint32_t max_nr_retrans;/* sctpAssocEntry 11 */ + uint32_t max_nr_retrans; /* sctpAssocEntry 11 */ uint32_t primary_process; /* sctpAssocEntry 12 */ uint32_t T1_expireries; /* sctpAssocEntry 13 */ uint32_t T2_expireries; /* sctpAssocEntry 14 */ @@ -1305,37 +1304,37 @@ void sctp_freeladdrs(struct sockaddr *); int sctp_opt_info(int, sctp_assoc_t, int, void *, socklen_t *); /* deprecated */ -ssize_t +ssize_t sctp_sendmsg(int, const void *, size_t, const struct sockaddr *, socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t); /* deprecated */ -ssize_t +ssize_t sctp_send(int, const void *, size_t, const struct sctp_sndrcvinfo *, int); /* deprecated */ -ssize_t +ssize_t sctp_sendx(int, const void *, size_t, struct sockaddr *, int, struct sctp_sndrcvinfo *, int); /* deprecated */ -ssize_t +ssize_t sctp_sendmsgx(int sd, const void *, size_t, struct sockaddr *, int, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t); sctp_assoc_t sctp_getassocid(int, struct sockaddr *); /* deprecated */ -ssize_t +ssize_t sctp_recvmsg(int, void *, size_t, struct sockaddr *, socklen_t *, struct sctp_sndrcvinfo *, int *); -ssize_t +ssize_t sctp_sendv(int, const struct iovec *, int, struct sockaddr *, int, void *, socklen_t, unsigned int, int); -ssize_t +ssize_t sctp_recvv(int, const struct iovec *, int, struct sockaddr *, socklen_t *, void *, socklen_t *, unsigned int *, int *); diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index 071d44c2..b519971c 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -391,6 +391,7 @@ sctp_getcred(SYSCTL_HANDLER_ARGS) SCTP_INP_DECR_REF(inp); goto cred_can_cont; } + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); error = ENOENT; goto out; @@ -431,6 +432,7 @@ sctp_abort(struct socket *so) if (inp == NULL) { return; } + sctp_must_try_again: flags = inp->sctp_flags; #ifdef SCTP_LOG_CLOSING @@ -704,8 +706,7 @@ sctp_disconnect(struct socket *so) if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || (so->so_rcv.sb_cc > 0)) { - if (SCTP_GET_STATE(asoc) != - SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) { /* Left with Data unread */ struct mbuf *op_err; @@ -714,8 +715,8 @@ sctp_disconnect(struct socket *so) SCTP_STAT_INCR_COUNTER32(sctps_aborted); } SCTP_INP_RUNLOCK(inp); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, @@ -730,17 +731,16 @@ sctp_disconnect(struct socket *so) if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* only send SHUTDOWN 1st time thru */ struct sctp_nets *netp; - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; @@ -773,11 +773,11 @@ sctp_disconnect(struct socket *so) netp = stcb->asoc.primary_destination; } - asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && @@ -789,8 +789,8 @@ sctp_disconnect(struct socket *so) stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4; sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_INP_RUNLOCK(inp); @@ -921,9 +921,9 @@ sctp_shutdown(struct socket *so) SCTP_INP_RUNLOCK(inp); return (0); } - if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_ECHOED) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN)) { + if ((SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_ECHOED) && + (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN)) { /* * If we are not in or before ESTABLISHED, there is * no protocol action required. @@ -937,7 +937,7 @@ sctp_shutdown(struct socket *so) } else { netp = stcb->asoc.primary_destination; } - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) && TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { @@ -946,8 +946,7 @@ sctp_shutdown(struct socket *so) } /* there is nothing queued to send, so I'm done... */ SCTP_STAT_DECR_GAUGE32(sctps_currestab); - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, @@ -957,9 +956,9 @@ sctp_shutdown(struct socket *so) * We still got (or just got) data to send, so set * SHUTDOWN_PENDING. */ - SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { - SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && @@ -1369,11 +1368,13 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } + if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); @@ -1438,6 +1439,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, goto out_now; } } + /* FIX ME: do we want to pass in a vrf on the connect call? */ vrf_id = inp->def_vrf_id; @@ -1457,7 +1459,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, /* Set the connected flag so we can queue data */ soisconnecting(so); } - SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); /* move to second address */ switch (sa->sa_family) { #ifdef INET @@ -1549,6 +1551,7 @@ sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } + inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -2393,6 +2396,7 @@ flags_out: break; } } + if (stcb != NULL) { /* Applies to the specific association */ paddrp->spp_flags = 0; @@ -3262,6 +3266,7 @@ flags_out: break; } } + if (stcb != NULL) { if (net != NULL) { thlds->spt_pathmaxrxt = net->failure_threshold; @@ -3374,6 +3379,7 @@ flags_out: break; } } + if (stcb != NULL) { if (net) { encaps->sue_port = net->port; @@ -4252,6 +4258,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; + } else { + inp->auth_supported = 1; } SCTP_INP_WUNLOCK(inp); break; @@ -4397,6 +4405,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = EINVAL; break; } + hmaclist = sctp_alloc_hmaclist((uint16_t)shmac->shmac_number_of_idents); if (hmaclist == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); @@ -4589,6 +4598,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } SCTP_INP_RUNLOCK(inp); } + } break; } @@ -5272,12 +5282,14 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } + if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) { if (stcb) SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } + if (stcb != NULL) { /************************TCB SPECIFIC SET ******************/ if (net != NULL) { @@ -5413,6 +5425,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, net->failure_threshold = paddrp->spp_pathmaxrxt; } } + if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; @@ -5523,6 +5536,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (paddrp->spp_pathmaxrxt != 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } + if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; else if (paddrp->spp_hbinterval != 0) { @@ -5530,6 +5544,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); } + if (paddrp->spp_flags & SPP_HB_ENABLE) { if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; @@ -6482,6 +6497,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, break; } } + if (stcb != NULL) { if (net != NULL) { net->port = encaps->sue_port; @@ -6865,6 +6881,7 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return EINVAL; } + switch (addr->sa_family) { #ifdef INET6 case AF_INET6: @@ -6970,6 +6987,7 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) error = EALREADY; goto out_now; } + vrf_id = inp->def_vrf_id; /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, @@ -6984,7 +7002,7 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) /* Set the connected flag so we can queue data */ soisconnecting(so); } - SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* initialize authentication parameters for the assoc */ @@ -6996,6 +7014,7 @@ out_now: if (create_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); } + SCTP_INP_DECR_REF(inp); return (error); } @@ -7134,6 +7153,7 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) return (EADDRINUSE); } } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ @@ -7201,7 +7221,7 @@ sctp_accept(struct socket *so, struct sockaddr **addr) SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(inp); store = stcb->asoc.primary_destination->ro._l_addr; - stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); SCTP_TCB_UNLOCK(stcb); switch (store.sa.sa_family) { #ifdef INET @@ -7336,6 +7356,7 @@ sctp_ingetaddr(struct socket *so, struct sockaddr **addr) SCTP_TCB_UNLOCK(stcb); goto notConn; } + vrf_id = inp->def_vrf_id; sctp_ifa = sctp_source_address_selection(inp, stcb, diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h index 84cbfc88..175888c3 100644 --- a/freebsd/sys/netinet/sctp_var.h +++ b/freebsd/sys/netinet/sctp_var.h @@ -341,12 +341,12 @@ int sctp_input(struct mbuf **, int *, int); void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t); void sctp_drain(void); void sctp_init(void); -void +void sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, uint8_t, uint8_t, uint16_t, uint32_t); int sctp_flush(struct socket *, int); int sctp_shutdown(struct socket *); -int +int sctp_bindx(struct socket *, int, struct sockaddr_storage *, int, int, struct proc *); diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index aad1e19d..c3cb115e 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$"); #endif #include <netinet/udp.h> #include <netinet/udp_var.h> +#include <netinet/in_kdtrace.h> #include <sys/proc.h> #ifdef INET6 #include <netinet/icmp6.h> @@ -1016,7 +1017,7 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asoc = &stcb->asoc; /* init all variables to a known value. */ - SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_INUSE); + SCTP_SET_STATE(stcb, SCTP_STATE_INUSE); asoc->max_burst = inp->sctp_ep.max_burst; asoc->fr_max_burst = inp->sctp_ep.fr_max_burst; asoc->heart_beat_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); @@ -1435,6 +1436,7 @@ select_a_new_ep: atomic_add_int(&it->stcb->asoc.refcnt, -1); iteration_count = 0; } + /* run function on this one */ (*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val); @@ -1788,6 +1790,7 @@ sctp_timeout_handler(void *t) if ((stcb == NULL) || (inp == NULL)) { break; } + if (sctp_cookie_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; @@ -1983,6 +1986,7 @@ out_decr: if (inp) { SCTP_INP_DECR_REF(inp); } + out_no_decr: SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type = %d)\n", type); CURVNET_RESTORE(); @@ -2498,9 +2502,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb, } timevalsub(&now, old); /* store the current RTT in us */ - net->rtt = (uint64_t)1000000 *(uint64_t)now.tv_sec + - (uint64_t)now.tv_usec; - + net->rtt = (uint64_t)1000000 * (uint64_t)now.tv_sec + + (uint64_t)now.tv_usec; /* compute rtt in ms */ rtt = (int32_t)(net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { @@ -2522,6 +2525,7 @@ sctp_calculate_rto(struct sctp_tcb *stcb, net->lan_type = SCTP_LAN_LOCAL; } } + /***************************/ /* 2. update RTTVAR & SRTT */ /***************************/ @@ -2798,7 +2802,7 @@ set_error: ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC))) { SOCK_LOCK(stcb->sctp_socket); if (from_peer) { - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { + if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNREFUSED); stcb->sctp_socket->so_error = ECONNREFUSED; } else { @@ -2806,8 +2810,8 @@ set_error: stcb->sctp_socket->so_error = ECONNRESET; } } else { - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ETIMEDOUT); stcb->sctp_socket->so_error = ETIMEDOUT; } else { @@ -2960,6 +2964,7 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error, /* event not enabled */ return; } + if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { notifhdr_len = sizeof(struct sctp_send_failed_event); } else { @@ -3188,6 +3193,7 @@ sctp_notify_adaptation_layer(struct sctp_tcb *stcb) /* event not enabled */ return; } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ @@ -3244,6 +3250,7 @@ sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { return; } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ @@ -3352,6 +3359,7 @@ sctp_notify_shutdown_event(struct sctp_tcb *stcb) /* event not enabled */ return; } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ @@ -3401,6 +3409,7 @@ sctp_notify_sender_dry_event(struct sctp_tcb *stcb, /* event not enabled */ return; } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* no space left */ @@ -3557,6 +3566,7 @@ sctp_notify_stream_reset(struct sctp_tcb *stcb, /* event not enabled */ return; } + m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ @@ -3691,8 +3701,8 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) { return; } - if ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) || - (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || (notification == SCTP_NOTIFY_INTERFACE_UP) || (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { @@ -3766,16 +3776,16 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, break; } case SCTP_NOTIFY_ASSOC_LOC_ABORTED: - if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || - ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 0, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 0, so_locked); } break; case SCTP_NOTIFY_ASSOC_REM_ABORTED: - if (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || - ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 1, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 1, so_locked); @@ -4019,7 +4029,7 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, if (stcb != NULL) { /* We have a TCB to abort, send notification too */ sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED); - stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); /* Ok, now lets free it */ #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); @@ -4030,8 +4040,8 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, @@ -4130,13 +4140,13 @@ sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, } return; } else { - stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); } /* notify the peer */ sctp_send_abort_tcb(stcb, op_err, so_locked); SCTP_STAT_INCR_COUNTER32(sctps_aborted); - if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } /* notify the ulp */ @@ -4971,6 +4981,7 @@ sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, if (holds_lock == 0) { SCTP_INP_RLOCK(inp); } + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) continue; @@ -5060,6 +5071,7 @@ sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } + hash_of_addr = sctp_get_ifa_hash_val(addr); hash_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; @@ -5121,9 +5133,8 @@ sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t *freed_so_far, int hold_rlock, atomic_add_int(&stcb->asoc.refcnt, 1); - if (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED | - SCTP_STATE_SHUTDOWN_RECEIVED | - SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED | SCTP_STATE_SHUTDOWN_RECEIVED))) { /* Pre-check If we are freeing no update */ goto no_lock; } @@ -5184,6 +5195,7 @@ out: if (so && r_unlocked && hold_rlock) { SCTP_INP_READ_LOCK(stcb->sctp_ep); } + SCTP_INP_DECR_REF(stcb->sctp_ep); no_lock: atomic_add_int(&stcb->asoc.refcnt, -1); @@ -5233,6 +5245,7 @@ sctp_sorecvmsg(struct socket *so, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } + if (msg_flags) { in_flags = *msg_flags; if (in_flags & MSG_PEEK) @@ -5276,6 +5289,8 @@ sctp_sorecvmsg(struct socket *so, sctp_misc_ints(SCTP_SORECV_ENTERPL, rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid); } + + error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0)); if (error) { goto release_unlocked; @@ -5385,6 +5400,7 @@ restart_nosblocks: hold_rlock = 0; goto restart; } + if ((control->length == 0) && (control->do_not_ref_stcb)) { /* @@ -5568,6 +5584,7 @@ found_one: control->do_not_ref_stcb == 0) { stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1; } + /* First lets get off the sinfo and sockaddr info */ if ((sinfo != NULL) && (filling_sinfo != 0)) { sinfo->sinfo_stream = control->sinfo_stream; @@ -5729,6 +5746,7 @@ get_more_data: if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { goto release; } + if ((control->do_not_ref_stcb == 0) && stcb && stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { no_rcv_needed = 1; @@ -5941,6 +5959,7 @@ wait_some_more: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { goto release; } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) goto release; @@ -6069,6 +6088,7 @@ release: SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } + sbunlock(&so->so_rcv); sockbuf_lock = 0; @@ -6106,6 +6126,7 @@ out: if (sockbuf_lock) { sbunlock(&so->so_rcv); } + if (freecnt_applied) { /* * The lock on the socket buffer protects us so the free @@ -6703,6 +6724,7 @@ sctp_local_addr_count(struct sctp_tcb *stcb) SCTP_IPI_ADDR_RUNLOCK(); return (0); } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* * bound all case: go through all ifns on the vrf @@ -7362,3 +7384,49 @@ sctp_hc_get_mtu(union sctp_sockstore *addr, uint16_t fibnum) } return ((uint32_t)tcp_hc_getmtu(&inc)); } + +void +sctp_set_state(struct sctp_tcb *stcb, int new_state) +{ +#if defined(KDTRACE_HOOKS) + int old_state = stcb->asoc.state; +#endif + + KASSERT((new_state & ~SCTP_STATE_MASK) == 0, + ("sctp_set_state: Can't set substate (new_state = %x)", + new_state)); + stcb->asoc.state = (stcb->asoc.state & ~SCTP_STATE_MASK) | new_state; + if ((new_state == SCTP_STATE_SHUTDOWN_RECEIVED) || + (new_state == SCTP_STATE_SHUTDOWN_SENT) || + (new_state == SCTP_STATE_SHUTDOWN_ACK_SENT)) { + SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); + } +#if defined(KDTRACE_HOOKS) + if (((old_state & SCTP_STATE_MASK) != new_state) && + !(((old_state & SCTP_STATE_MASK) == SCTP_STATE_EMPTY) && + (new_state == SCTP_STATE_INUSE))) { + SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); + } +#endif +} + +void +sctp_add_substate(struct sctp_tcb *stcb, int substate) +{ +#if defined(KDTRACE_HOOKS) + int old_state = stcb->asoc.state; +#endif + + KASSERT((substate & SCTP_STATE_MASK) == 0, + ("sctp_add_substate: Can't set state (substate = %x)", + substate)); + stcb->asoc.state |= substate; +#if defined(KDTRACE_HOOKS) + if (((substate & SCTP_STATE_ABOUT_TO_BE_FREED) && + ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || + ((substate & SCTP_STATE_SHUTDOWN_PENDING) && + ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { + SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); + } +#endif +} diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h index 61d34591..c12fb210 100644 --- a/freebsd/sys/netinet/sctputil.h +++ b/freebsd/sys/netinet/sctputil.h @@ -72,11 +72,9 @@ int32_t uint32_t sctp_get_ifa_hash_val(struct sockaddr *addr); -struct sctp_ifa * - sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock); +struct sctp_ifa *sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock); -struct sctp_ifa * - sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock); +struct sctp_ifa *sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock); uint32_t sctp_select_initial_TSN(struct sctp_pcb *); @@ -147,13 +145,11 @@ struct sctp_paramhdr * sctp_get_next_param(struct mbuf *, int, struct sctp_paramhdr *, int); -struct mbuf * - sctp_add_pad_tombuf(struct mbuf *, int); +struct mbuf *sctp_add_pad_tombuf(struct mbuf *, int); -struct mbuf * - sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); +struct mbuf *sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); -void +void sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED @@ -168,7 +164,7 @@ sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, void sctp_stop_timers_for_shutdown(struct sctp_tcb *); -void +void sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) SCTP_UNUSED @@ -177,7 +173,7 @@ sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int, int int sctp_expand_mapping_array(struct sctp_association *, uint32_t); -void +void sctp_abort_notification(struct sctp_tcb *, uint8_t, uint16_t, struct sctp_abort_chunk *, int #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) @@ -203,7 +199,7 @@ sctp_abort_an_association(struct sctp_inpcb *, struct sctp_tcb *, #endif ); -void +void sctp_handle_ootb(struct mbuf *, int, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct sctp_inpcb *, @@ -211,7 +207,7 @@ sctp_handle_ootb(struct mbuf *, int, int, uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); -int +int sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, int totaddr, int *error); @@ -224,8 +220,7 @@ int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *); #ifdef INET6 uint32_t sctp_is_same_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); -struct sockaddr_in6 * - sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); +struct sockaddr_in6 *sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); #define sctp_recover_scope_mac(addr, store) do { \ if ((addr->sin6_family == AF_INET6) && \ @@ -258,11 +253,11 @@ sctp_release_pr_sctp_chunk(struct sctp_tcb *, struct sctp_tmit_chunk *, struct mbuf *sctp_generate_cause(uint16_t, char *); struct mbuf *sctp_generate_no_user_data_cause(uint32_t); -void +void sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, struct sockaddr *sa, sctp_assoc_t assoc_id, uint32_t vrf_id, int *error, void *p); -void +void sctp_bindx_delete_address(struct sctp_inpcb *inp, struct sockaddr *sa, sctp_assoc_t assoc_id, uint32_t vrf_id, int *error); @@ -393,5 +388,7 @@ void sctp_audit_log(uint8_t, uint8_t); uint32_t sctp_min_mtu(uint32_t, uint32_t, uint32_t); void sctp_hc_set_mtu(union sctp_sockstore *, uint16_t, uint32_t); uint32_t sctp_hc_get_mtu(union sctp_sockstore *, uint16_t); +void sctp_set_state(struct sctp_tcb *, int); +void sctp_add_substate(struct sctp_tcb *, int); #endif /* _KERNEL */ #endif diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c index d1de3f33..f2e3d875 100644 --- a/freebsd/sys/netinet/tcp_hostcache.c +++ b/freebsd/sys/netinet/tcp_hostcache.c @@ -114,10 +114,10 @@ __FBSDID("$FreeBSD$"); #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ -static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache); +VNET_DEFINE_STATIC(struct tcp_hostcache, tcp_hostcache); #define V_tcp_hostcache VNET(tcp_hostcache) -static VNET_DEFINE(struct callout, tcp_hc_callout); +VNET_DEFINE_STATIC(struct callout, tcp_hc_callout); #define V_tcp_hc_callout VNET(tcp_hc_callout) static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); diff --git a/freebsd/sys/netinet/tcp_hpts.h b/freebsd/sys/netinet/tcp_hpts.h index c52a1d78..04c86769 100644 --- a/freebsd/sys/netinet/tcp_hpts.h +++ b/freebsd/sys/netinet/tcp_hpts.h @@ -238,10 +238,10 @@ int #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked); + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line); + int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); #define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index 20bea2de..2c6c3048 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -585,6 +585,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; + struct epoch_tracker et; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; @@ -775,7 +776,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto) * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; @@ -962,25 +963,10 @@ findpcb: * * XXXRW: It may be time to rethink timewait locking. */ -relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); @@ -991,7 +977,7 @@ relocked: */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* @@ -1028,23 +1014,8 @@ relocked: (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !IS_FASTOPEN(tp->t_flags)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - goto relocked; - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } @@ -1082,6 +1053,8 @@ relocked: #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; + if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) + inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else @@ -1176,9 +1149,11 @@ tfo_socket_result: * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ + TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, - iptos, ti_locked); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + iptos); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* @@ -1382,7 +1357,7 @@ tfo_socket_result: * Only the listen socket is unlocked by syncache_add(). */ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); @@ -1416,15 +1391,16 @@ tfo_socket_result: * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ - tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1448,7 +1424,7 @@ dropunlock: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1535,8 +1511,7 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, - int ti_locked) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win; @@ -1562,7 +1537,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); - /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we @@ -1571,19 +1545,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " - "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", @@ -1717,10 +1679,19 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { - if (to.to_flags & TOF_FASTOPEN) - tcp_fastopen_update_cache(tp, to.to_mss, + if (to.to_flags & TOF_FASTOPEN) { + uint16_t mss; + + if (to.to_flags & TOF_MSS) + mss = to.to_mss; + else + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + mss = TCP6_MSS; + else + mss = TCP_MSS; + tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); - else + } else tcp_fastopen_disable_path(tp); } } @@ -1767,7 +1738,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && - LIST_EMPTY(&tp->t_segq) && + SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { @@ -1792,10 +1763,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - TCPSTAT_INC(tcps_predack); /* @@ -1899,10 +1866,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); @@ -2104,8 +2067,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " - "ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); @@ -2180,9 +2141,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, - ("%s: TH_RST ti_locked %d, th %p tp %p", - __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); @@ -2225,8 +2183,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { - KASSERT(ti_locked == TI_RLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); @@ -2340,8 +2296,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " - "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { @@ -2457,6 +2411,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; + if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to + * regular ACK processing below. + */ + tp->snd_una++; + } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; @@ -2464,16 +2428,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); - if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { - tcp_fastopen_decrement_counter(tp->t_tfo_pending); - tp->t_tfo_pending = NULL; - - /* - * Account for the ACK of our SYN prior to - * regular ACK processing below. - */ - tp->snd_una++; - } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such @@ -2490,7 +2444,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ @@ -2931,7 +2885,6 @@ process_ACK: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -3068,7 +3021,7 @@ dodata: /* XXX */ * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && - LIST_EMPTY(&tp->t_segq) && + SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) @@ -3093,7 +3046,7 @@ dodata: /* XXX */ * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ - thflags = tcp_reass(tp, th, &tlen, m); + thflags = tcp_reass(tp, th, &save_start, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) @@ -3163,19 +3116,11 @@ dodata: /* XXX */ */ case TCPS_FIN_WAIT_2: INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " - "TCP_FIN_WAIT_2 ti_locked: %d", __func__, - ti_locked)); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, @@ -3190,9 +3135,6 @@ dodata: /* XXX */ (void) tp->t_fb->tfb_tcp_output(tp); check_delack: - KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", - __func__, ti_locked)); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { @@ -3230,10 +3172,6 @@ dropafterack: &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); @@ -3241,10 +3179,6 @@ dropafterack: return; dropwithreset: - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); @@ -3253,15 +3187,6 @@ dropwithreset: return; drop: - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); - ti_locked = TI_UNLOCKED; - } -#ifdef INVARIANTS - else - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); -#endif - /* * Drop space held by incoming segment and return. */ diff --git a/freebsd/sys/netinet/tcp_log_buf.h b/freebsd/sys/netinet/tcp_log_buf.h index 58713fe5..e569395a 100644 --- a/freebsd/sys/netinet/tcp_log_buf.h +++ b/freebsd/sys/netinet/tcp_log_buf.h @@ -94,7 +94,7 @@ struct tcp_log_bbr { uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; - uint8_t inpacer; + uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; @@ -217,7 +217,9 @@ enum tcp_log_events { BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - TCP_LOG_END /* End (keep at end) 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + TCP_LOG_REASS, /* Reassembly buffer logging 50 */ + TCP_LOG_END /* End (keep at end) 51 */ }; enum tcp_log_states { diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index bdbfe984..8f83440d 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -145,18 +145,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat, CTLFLAG_VNET | CTLFLAG_R tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) -#ifdef TCP_HHOOK -static void inline hhook_run_tcp_est_out(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to, - uint32_t len, int tso); -#endif static void inline cc_after_idle(struct tcpcb *tp); #ifdef TCP_HHOOK /* * Wrapper for the TCP established output helper hook. */ -static void inline +void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso) { @@ -197,6 +192,8 @@ tcp_output(struct tcpcb *tp) int32_t len; uint32_t recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ + u_int if_hw_tsomaxsegcount = 0; + u_int if_hw_tsomaxsegsize; struct mbuf *m; struct ip *ip = NULL; #ifdef TCPDEBUG @@ -233,13 +230,15 @@ tcp_output(struct tcpcb *tp) #endif /* - * For TFO connections in SYN_RECEIVED, only allow the initial - * SYN|ACK and those sent by the retransmit timer. + * For TFO connections in SYN_SENT or SYN_RECEIVED, + * only allow the initial SYN or SYN|ACK and those sent + * by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && - (tp->t_state == TCPS_SYN_RECEIVED) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ - (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); /* @@ -867,9 +866,6 @@ send: if (tso) { u_int if_hw_tsomax; - u_int if_hw_tsomaxsegcount; - u_int if_hw_tsomaxsegsize; - struct mbuf *mb; u_int moff; int max_len; @@ -901,65 +897,6 @@ send: len = max_len; } } - - /* - * Check if we should limit by maximum segment - * size and count: - */ - if (if_hw_tsomaxsegcount != 0 && - if_hw_tsomaxsegsize != 0) { - /* - * Subtract one segment for the LINK - * and TCP/IP headers mbuf that will - * be prepended to this mbuf chain - * after the code in this section - * limits the number of mbufs in the - * chain to if_hw_tsomaxsegcount. - */ - if_hw_tsomaxsegcount -= 1; - max_len = 0; - mb = sbsndmbuf(&so->so_snd, off, &moff); - - while (mb != NULL && max_len < len) { - u_int mlen; - u_int frags; - - /* - * Get length of mbuf fragment - * and how many hardware frags, - * rounded up, it would use: - */ - mlen = (mb->m_len - moff); - frags = howmany(mlen, - if_hw_tsomaxsegsize); - - /* Handle special case: Zero Length Mbuf */ - if (frags == 0) - frags = 1; - - /* - * Check if the fragment limit - * will be reached or exceeded: - */ - if (frags >= if_hw_tsomaxsegcount) { - max_len += min(mlen, - if_hw_tsomaxsegcount * - if_hw_tsomaxsegsize); - break; - } - max_len += mlen; - if_hw_tsomaxsegcount -= frags; - moff = 0; - mb = mb->m_next; - } - if (max_len <= 0) { - len = 0; - } else if (len > max_len) { - sendalot = 1; - len = max_len; - } - } - /* * Prevent the last segment from being * fractional unless the send sockbuf can be @@ -994,7 +931,6 @@ send: */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; - } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; @@ -1029,6 +965,7 @@ send: */ if (len) { struct mbuf *mb; + struct sockbuf *msb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) @@ -1062,14 +999,30 @@ send: * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ - mb = sbsndptr(&so->so_snd, off, len, &moff); - + mb = sbsndptr_noadv(&so->so_snd, off, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + sbsndptr_adv(&so->so_snd, mb, len); m->m_len += len; } else { - m->m_next = m_copym(mb, moff, len, M_NOWAIT); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + msb = NULL; + else + msb = &so->so_snd; + m->m_next = tcp_m_copym(mb, moff, + &len, if_hw_tsomaxsegcount, + if_hw_tsomaxsegsize, msb); + if (len <= (tp->t_maxseg - optlen)) { + /* + * Must have ran out of mbufs for the copy + * shorten it to no longer need tso. Lets + * not put on sendalot since we are low on + * mbufs. + */ + tso = 0; + } if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); @@ -1853,6 +1806,144 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) return (optlen); } +/* + * This is a copy of m_copym(), taking the TSO segment size/limit + * constraints into account, and advancing the sndptr as it goes. + */ +struct mbuf * +tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, + int32_t seglimit, int32_t segsize, struct sockbuf *sb) +{ + struct mbuf *n, **np; + struct mbuf *top; + int32_t off = off0; + int32_t len = *plen; + int32_t fragsize; + int32_t len_cp = 0; + int32_t *pkthdrlen; + uint32_t mlen, frags; + bool copyhdr; + + + KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off)); + KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len)); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = true; + else + copyhdr = false; + while (off > 0) { + KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain")); + if (off < m->m_len) + break; + off -= m->m_len; + if ((sb) && (m == sb->sb_sndptr)) { + sb->sb_sndptroff += m->m_len; + sb->sb_sndptr = m->m_next; + } + m = m->m_next; + } + np = ⊤ + top = NULL; + pkthdrlen = NULL; + while (len > 0) { + if (m == NULL) { + KASSERT(len == M_COPYALL, + ("tcp_m_copym, length > size of mbuf chain")); + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + mlen = min(len, m->m_len - off); + if (seglimit) { + /* + * For M_NOMAP mbufs, add 3 segments + * + 1 in case we are crossing page boundaries + * + 2 in case the TLS hdr/trailer are used + * It is cheaper to just add the segments + * than it is to take the cache miss to look + * at the mbuf ext_pgs state in detail. + */ + if (m->m_flags & M_NOMAP) { + fragsize = min(segsize, PAGE_SIZE); + frags = 3; + } else { + fragsize = segsize; + frags = 0; + } + + /* Break if we really can't fit anymore. */ + if ((frags + 1) >= seglimit) { + *plen = len_cp; + if (pkthdrlen != NULL) + *pkthdrlen = len_cp; + break; + } + + /* + * Reduce size if you can't copy the whole + * mbuf. If we can't copy the whole mbuf, also + * adjust len so the loop will end after this + * mbuf. + */ + if ((frags + howmany(mlen, fragsize)) >= seglimit) { + mlen = (seglimit - frags - 1) * fragsize; + len = mlen; + *plen = len_cp + len; + if (pkthdrlen != NULL) + *pkthdrlen = *plen; + } + frags += howmany(mlen, fragsize); + if (frags == 0) + frags++; + seglimit -= frags; + KASSERT(seglimit > 0, + ("%s: seglimit went too low", __func__)); + } + if (copyhdr) + n = m_gethdr(M_NOWAIT, m->m_type); + else + n = m_get(M_NOWAIT, m->m_type); + *np = n; + if (n == NULL) + goto nospace; + if (copyhdr) { + if (!m_dup_pkthdr(n, m, M_NOWAIT)) + goto nospace; + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + pkthdrlen = &n->m_pkthdr.len; + copyhdr = false; + } + n->m_len = mlen; + len_cp += n->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + mb_dupcl(n, m); + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (u_int)n->m_len); + + if (sb && (sb->sb_sndptr == m) && + ((n->m_len + off) >= m->m_len) && m->m_next) { + sb->sb_sndptroff += m->m_len; + sb->sb_sndptr = m->m_next; + } + off = 0; + if (len != M_COPYALL) { + len -= n->m_len; + } + m = m->m_next; + np = &n->m_next; + } + return (top); +nospace: + m_freem(top); + return (NULL); +} + void tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, uint32_t sendwin) { diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c index dbb61299..4776a808 100644 --- a/freebsd/sys/netinet/tcp_reass.c +++ b/freebsd/sys/netinet/tcp_reass.c @@ -74,15 +74,37 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/tcp_log_buf.h> +#include <netinet/tcp_hpts.h> #include <netinet6/tcp6_var.h> #include <netinet/tcpip.h> #ifdef TCPDEBUG #include <netinet/tcp_debug.h> #endif /* TCPDEBUG */ +#define TCP_R_LOG_ADD 1 +#define TCP_R_LOG_LIMIT_REACHED 2 +#define TCP_R_LOG_APPEND 3 +#define TCP_R_LOG_PREPEND 4 +#define TCP_R_LOG_REPLACE 5 +#define TCP_R_LOG_MERGE_INTO 6 +#define TCP_R_LOG_NEW_ENTRY 7 +#define TCP_R_LOG_READ 8 +#define TCP_R_LOG_ZERO 9 +#define TCP_R_LOG_DUMP 10 +#define TCP_R_LOG_TRIM 11 + +/* For debugging we want counters and BB logging */ +/* #define TCP_REASS_COUNTERS 1 */ +/* #define TCP_REASS_LOGGING 1 */ + static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); +static SYSCTL_NODE(_net_inet_tcp_reass, OID_AUTO, stats, CTLFLAG_RW, 0, + "TCP Segment Reassembly stats"); + + static int tcp_reass_maxseg = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, &tcp_reass_maxseg, 0, @@ -93,6 +115,77 @@ SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, 0, &tcp_reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); +static u_int tcp_reass_maxqueuelen = 100; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN, + &tcp_reass_maxqueuelen, 0, + "Maximum number of TCP Segments per Reassembly Queue"); + +static int tcp_new_limits = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, new_limit, CTLFLAG_RWTUN, + &tcp_new_limits, 0, + "Do we use the new limit method we are discussing?"); + +static u_int tcp_reass_queue_guard = 16; +SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, queueguard, CTLFLAG_RWTUN, + &tcp_reass_queue_guard, 16, + "Number of TCP Segments in Reassembly Queue where we flip over to guard mode"); + +#ifdef TCP_REASS_COUNTERS + +counter_u64_t reass_entry; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, entry, CTLFLAG_RD, + &reass_entry, "A segment entered reassembly "); + +counter_u64_t reass_path1; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path1, CTLFLAG_RD, + &reass_path1, "Took path 1"); + +counter_u64_t reass_path2; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path2, CTLFLAG_RD, + &reass_path2, "Took path 2"); + +counter_u64_t reass_path3; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path3, CTLFLAG_RD, + &reass_path3, "Took path 3"); + +counter_u64_t reass_path4; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path4, CTLFLAG_RD, + &reass_path4, "Took path 4"); + +counter_u64_t reass_path5; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path5, CTLFLAG_RD, + &reass_path5, "Took path 5"); + +counter_u64_t reass_path6; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path6, CTLFLAG_RD, + &reass_path6, "Took path 6"); + +counter_u64_t reass_path7; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path7, CTLFLAG_RD, + &reass_path7, "Took path 7"); + +counter_u64_t reass_fullwalk; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, fullwalk, CTLFLAG_RD, + &reass_fullwalk, "Took a full walk "); + +counter_u64_t reass_nospace; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, nospace, CTLFLAG_RD, + &reass_nospace, "Had no mbuf capacity "); + +counter_u64_t merge_fwd; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, merge_fwd, CTLFLAG_RD, + &merge_fwd, "Ran merge fwd"); + +counter_u64_t merge_into; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, merge_into, CTLFLAG_RD, + &merge_into, "Ran merge into"); + +counter_u64_t tcp_zero_input; +SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, zero_input, CTLFLAG_RD, + &tcp_zero_input, "The reassembly buffer saw a zero len segment etc"); + +#endif + /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) @@ -104,6 +197,77 @@ tcp_reass_zone_change(void *tag) tcp_reass_maxseg); } +#ifdef TCP_REASS_LOGGING + +static void +tcp_log_reassm(struct tcpcb *tp, struct tseg_qent *q, struct tseg_qent *p, + tcp_seq seq, int len, uint8_t action, int instance) +{ + uint32_t cts; + struct timeval tv; + + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log, 0, sizeof(log)); + cts = tcp_get_usecs(&tv); + log.u_bbr.flex1 = seq; + log.u_bbr.cur_del_rate = (uint64_t)q; + log.u_bbr.delRate = (uint64_t)p; + if (q != NULL) { + log.u_bbr.flex2 = q->tqe_start; + log.u_bbr.flex3 = q->tqe_len; + log.u_bbr.flex4 = q->tqe_mbuf_cnt; + log.u_bbr.hptsi_gain = q->tqe_flags; + } + if (p != NULL) { + log.u_bbr.flex5 = p->tqe_start; + log.u_bbr.pkts_out = p->tqe_len; + log.u_bbr.epoch = p->tqe_mbuf_cnt; + log.u_bbr.cwnd_gain = p->tqe_flags; + } + log.u_bbr.flex6 = tp->t_segqmbuflen; + log.u_bbr.flex7 = instance; + log.u_bbr.flex8 = action; + log.u_bbr.timeStamp = cts; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_LOG_REASS, 0, + len, &log, false, &tv); + } +} + +static void +tcp_reass_log_dump(struct tcpcb *tp) +{ + struct tseg_qent *q; + + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + TAILQ_FOREACH(q, &tp->t_segq, tqe_q) { + tcp_log_reassm(tp, q, NULL, q->tqe_start, q->tqe_len, TCP_R_LOG_DUMP, 0); + } + }; +} + +static void +tcp_reass_log_new_in(struct tcpcb *tp, tcp_seq seq, int len, struct mbuf *m, + int logval, struct tseg_qent *q) +{ + int cnt; + struct mbuf *t; + + cnt = 0; + t = m; + while (t) { + cnt += t->m_len; + t = t->m_next; + } + tcp_log_reassm(tp, q, NULL, seq, len, logval, cnt); +} + +#endif + void tcp_reass_global_init(void) { @@ -116,8 +280,24 @@ tcp_reass_global_init(void) /* Set the zone limit and read back the effective value. */ tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); +#ifdef TCP_REASS_COUNTERS + reass_path1 = counter_u64_alloc(M_WAITOK); + reass_path2 = counter_u64_alloc(M_WAITOK); + reass_path3 = counter_u64_alloc(M_WAITOK); + reass_path4 = counter_u64_alloc(M_WAITOK); + reass_path5 = counter_u64_alloc(M_WAITOK); + reass_path6 = counter_u64_alloc(M_WAITOK); + reass_path7 = counter_u64_alloc(M_WAITOK); + reass_fullwalk = counter_u64_alloc(M_WAITOK); + reass_nospace = counter_u64_alloc(M_WAITOK); + reass_entry = counter_u64_alloc(M_WAITOK); + merge_fwd = counter_u64_alloc(M_WAITOK); + merge_into = counter_u64_alloc(M_WAITOK); + tcp_zero_input = counter_u64_alloc(M_WAITOK); +#endif EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + } void @@ -127,32 +307,237 @@ tcp_reass_flush(struct tcpcb *tp) INP_WLOCK_ASSERT(tp->t_inpcb); - while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) { - LIST_REMOVE(qe, tqe_q); + while ((qe = TAILQ_FIRST(&tp->t_segq)) != NULL) { + TAILQ_REMOVE(&tp->t_segq, qe, tqe_q); m_freem(qe->tqe_m); uma_zfree(tcp_reass_zone, qe); tp->t_segqlen--; } - + tp->t_segqmbuflen = 0; KASSERT((tp->t_segqlen == 0), ("TCP reass queue %p segment count is %d instead of 0 after flush.", tp, tp->t_segqlen)); } +static void +tcp_reass_append(struct tcpcb *tp, struct tseg_qent *last, + struct mbuf *m, struct tcphdr *th, int tlen, + struct mbuf *mlast, int lenofoh) +{ + +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, last, NULL, th->th_seq, tlen, TCP_R_LOG_APPEND, 0); +#endif + last->tqe_len += tlen; + last->tqe_m->m_pkthdr.len += tlen; + /* Preserve the FIN bit if its there */ + last->tqe_flags |= (th->th_flags & TH_FIN); + last->tqe_last->m_next = m; + last->tqe_last = mlast; + last->tqe_mbuf_cnt += lenofoh; + tp->t_rcvoopack++; + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, tlen); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_new_in(tp, last->tqe_start, lenofoh, last->tqe_m, + TCP_R_LOG_APPEND, + last); +#endif +} + +static void +tcp_reass_prepend(struct tcpcb *tp, struct tseg_qent *first, struct mbuf *m, struct tcphdr *th, + int tlen, struct mbuf *mlast, int lenofoh) +{ + int i; + +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, first, NULL, th->th_seq, tlen, TCP_R_LOG_PREPEND, 0); +#endif + if (SEQ_GT((th->th_seq + tlen), first->tqe_start)) { + /* The new data overlaps into the old */ + i = (th->th_seq + tlen) - first->tqe_start; +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, first, NULL, 0, i, TCP_R_LOG_TRIM, 1); +#endif + m_adj(first->tqe_m, i); + first->tqe_len -= i; + first->tqe_start += i; + } + /* Ok now setup our chain to point to the old first */ + mlast->m_next = first->tqe_m; + first->tqe_m = m; + first->tqe_len += tlen; + first->tqe_start = th->th_seq; + first->tqe_m->m_pkthdr.len = first->tqe_len; + first->tqe_mbuf_cnt += lenofoh; + tp->t_rcvoopack++; + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, tlen); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_new_in(tp, first->tqe_start, lenofoh, first->tqe_m, + TCP_R_LOG_PREPEND, + first); +#endif +} + +static void +tcp_reass_replace(struct tcpcb *tp, struct tseg_qent *q, struct mbuf *m, + tcp_seq seq, int len, struct mbuf *mlast, int mbufoh, uint8_t flags) +{ + /* + * Free the data in q, and replace + * it with the new segment. + */ + int len_dif; + +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, q, NULL, seq, len, TCP_R_LOG_REPLACE, 0); +#endif + m_freem(q->tqe_m); + KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt, + ("Tp:%p seg queue goes negative", tp)); + tp->t_segqmbuflen -= q->tqe_mbuf_cnt; + q->tqe_mbuf_cnt = mbufoh; + q->tqe_m = m; + q->tqe_last = mlast; + q->tqe_start = seq; + if (len > q->tqe_len) + len_dif = len - q->tqe_len; + else + len_dif = 0; + tp->t_rcvoopack++; + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, len_dif); + q->tqe_len = len; + q->tqe_flags = (flags & TH_FIN); + q->tqe_m->m_pkthdr.len = q->tqe_len; + tp->t_segqmbuflen += mbufoh; + +} + +static void +tcp_reass_merge_into(struct tcpcb *tp, struct tseg_qent *ent, + struct tseg_qent *q) +{ + /* + * Merge q into ent and free q from the list. + */ +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, q, ent, 0, 0, TCP_R_LOG_MERGE_INTO, 0); +#endif +#ifdef TCP_REASS_COUNTERS + counter_u64_add(merge_into, 1); +#endif + ent->tqe_last->m_next = q->tqe_m; + ent->tqe_last = q->tqe_last; + ent->tqe_len += q->tqe_len; + ent->tqe_mbuf_cnt += q->tqe_mbuf_cnt; + ent->tqe_m->m_pkthdr.len += q->tqe_len; + ent->tqe_flags |= (q->tqe_flags & TH_FIN); + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); + uma_zfree(tcp_reass_zone, q); + tp->t_segqlen--; + +} + +static void +tcp_reass_merge_forward(struct tcpcb *tp, struct tseg_qent *ent) +{ + struct tseg_qent *q, *qtmp; + int i; + tcp_seq max; + /* + * Given an entry merge forward anyplace + * that ent overlaps forward. + */ + + max = ent->tqe_start + ent->tqe_len; + q = TAILQ_NEXT(ent, tqe_q); + if (q == NULL) { + /* Nothing left */ + return; + } + TAILQ_FOREACH_FROM_SAFE(q, &tp->t_segq, tqe_q, qtmp) { + if (SEQ_GT(q->tqe_start, max)) { + /* Beyond q */ + break; + } + /* We have some or all that are overlapping */ + if (SEQ_GEQ(max, (q->tqe_start + q->tqe_len))) { + /* It consumes it all */ + tp->t_segqmbuflen -= q->tqe_mbuf_cnt; + m_freem(q->tqe_m); + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); + uma_zfree(tcp_reass_zone, q); + tp->t_segqlen--; + continue; + } + /* + * Trim the q entry to dovetail to this one + * and then merge q into ent updating max + * in the process. + */ + i = max - q->tqe_start; +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, q, NULL, 0, i, TCP_R_LOG_TRIM, 2); +#endif + m_adj(q->tqe_m, i); + q->tqe_len -= i; + q->tqe_start += i; + tcp_reass_merge_into(tp, ent, q); + max = ent->tqe_start + ent->tqe_len; + } +#ifdef TCP_REASS_COUNTERS + counter_u64_add(merge_fwd, 1); +#endif +} + +static int +tcp_reass_overhead_of_chain(struct mbuf *m, struct mbuf **mlast) +{ + int len = MSIZE; + + if (m->m_flags & M_EXT) + len += m->m_ext.ext_size; + while (m->m_next != NULL) { + m = m->m_next; + len += MSIZE; + if (m->m_flags & M_EXT) + len += m->m_ext.ext_size; + } + *mlast = m; + return (len); +} + + +/* + * NOTE!!! the new tcp-reassembly code *must not* use + * m_adj() with a negative index. That alters the chain + * of mbufs (by possibly chopping trailing mbufs). At + * the front of tcp_reass we count the mbuf overhead + * and setup the tail pointer. If we use m_adj(m, -5) + * we could corrupt the tail pointer. Currently the + * code only uses m_adj(m, postive-num). If this + * changes appropriate changes to update mlast would + * be needed. + */ int -tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) +tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start, + int *tlenp, struct mbuf *m) { - struct tseg_qent *q; + struct tseg_qent *q, *last, *first; struct tseg_qent *p = NULL; - struct tseg_qent *nq; + struct tseg_qent *nq = NULL; struct tseg_qent *te = NULL; + struct tseg_qent tqs; + struct mbuf *mlast = NULL; + struct sockbuf *sb; struct socket *so = tp->t_inpcb->inp_socket; char *s = NULL; - int flags; - struct tseg_qent tqs; + int flags, i, lenofoh; INP_WLOCK_ASSERT(tp->t_inpcb); - /* * XXX: tcp_reass() is rather inefficient with its data structures * and should be rewritten (see NetBSD for optimizations). @@ -164,149 +549,475 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) */ if (th == NULL) goto present; - + KASSERT(SEQ_GEQ(th->th_seq, tp->rcv_nxt), + ("Attempt to add old entry to reassembly queue (th=%p, tp=%p)", + th, tp)); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_new_in(tp, th->th_seq, *tlenp, m, TCP_R_LOG_ADD, NULL); +#endif +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_entry, 1); +#endif /* - * Limit the number of segments that can be queued to reduce the - * potential for mbuf exhaustion. For best performance, we want to be - * able to queue a full window's worth of segments. The size of the - * socket receive buffer determines our advertised window and grows - * automatically when socket buffer autotuning is enabled. Use it as the - * basis for our queue limit. - * Always let the missing segment through which caused this queue. - * NB: Access to the socket buffer is left intentionally unlocked as we - * can tolerate stale information here. - * - * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat - * should work but causes packets to be dropped when they shouldn't. - * Investigate why and re-evaluate the below limit after the behaviour - * is understood. + * Check for zero length data. + */ + if ((*tlenp == 0) && ((th->th_flags & TH_FIN) == 0)) { + /* + * A zero length segment does no + * one any good. We could check + * the rcv_nxt <-> rcv_wnd but thats + * already done for us by the caller. + */ +#ifdef TCP_REASS_COUNTERS + counter_u64_add(tcp_zero_input, 1); +#endif + m_freem(m); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif + return (0); + } + /* + * Will it fit? */ - if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + lenofoh = tcp_reass_overhead_of_chain(m, &mlast); + sb = &tp->t_inpcb->inp_socket->so_rcv; + if ((sb->sb_mbcnt + tp->t_segqmbuflen + lenofoh) > sb->sb_mbmax) { + /* No room */ TCPSTAT_INC(tcps_rcvreassfull); - *tlenp = 0; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { - log(LOG_DEBUG, "%s; %s: queue limit reached, " - "segment dropped\n", s, __func__); - free(s, M_TCPLOG); - } +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_nospace, 1); +#endif +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, NULL, NULL, th->th_seq, lenofoh, TCP_R_LOG_LIMIT_REACHED, 0); +#endif m_freem(m); + *tlenp = 0; +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif return (0); } - /* - * Allocate a new queue entry. If we can't, or hit the zone limit - * just drop the pkt. - * - * Use a temporary structure on the stack for the missing segment - * when the zone is exhausted. Otherwise we may get stuck. + * First lets deal with two common cases, the + * segment appends to the back of our collected + * segments. Or the segment is the next in line. */ - te = uma_zalloc(tcp_reass_zone, M_NOWAIT); - if (te == NULL) { - if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) { - TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); + last = TAILQ_LAST_FAST(&tp->t_segq, tseg_qent, tqe_q); + if (last != NULL) { + if ((th->th_flags & TH_FIN) && + SEQ_LT((th->th_seq + *tlenp), (last->tqe_start + last->tqe_len))) { + /* + * Someone is trying to game us, dump + * the segment. + */ *tlenp = 0; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, - NULL))) { - log(LOG_DEBUG, "%s; %s: global zone limit " - "reached, segment dropped\n", s, __func__); - free(s, M_TCPLOG); + m_freem(m); + return (0); + } + if ((SEQ_GEQ(th->th_seq, last->tqe_start)) && + (SEQ_GEQ((last->tqe_start + last->tqe_len), th->th_seq))) { + /* Common case, trailing segment is added */ + /** + * +--last + * v + * reassembly buffer |---| |---| |---| + * new segment |---| + */ +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path1, 1); +#endif + if (SEQ_GT((last->tqe_start + last->tqe_len), th->th_seq)) { + i = (last->tqe_start + last->tqe_len) - th->th_seq; + if (i < *tlenp) { +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, last, NULL, 0, i, TCP_R_LOG_TRIM, 3); + th->th_seq += i; +#endif + m_adj(m, i); + *tlenp -= i; + } else { + /* Complete overlap */ + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); + m_freem(m); + *tlenp = last->tqe_len; + *seq_start = last->tqe_start; + return (0); + } + } + if (last->tqe_flags & TH_FIN) { + /* + * We have data after the FIN on the last? + */ + *tlenp = 0; + m_freem(m); + return(0); } + tcp_reass_append(tp, last, m, th, *tlenp, mlast, lenofoh); + tp->t_segqmbuflen += lenofoh; + *seq_start = last->tqe_start; + *tlenp = last->tqe_len; return (0); - } else { - bzero(&tqs, sizeof(struct tseg_qent)); - te = &tqs; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, - NULL))) { - log(LOG_DEBUG, - "%s; %s: global zone limit reached, using " - "stack for missing segment\n", s, __func__); - free(s, M_TCPLOG); + } else if (SEQ_GT(th->th_seq, (last->tqe_start + last->tqe_len))) { + /* + * Second common case, we missed + * another one and have something more + * for the end. + */ + /** + * +--last + * v + * reassembly buffer |---| |---| |---| + * new segment |---| + */ + if (last->tqe_flags & TH_FIN) { + /* + * We have data after the FIN on the last? + */ + *tlenp = 0; + m_freem(m); + return(0); } +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path2, 1); +#endif + p = last; + goto new_entry; } + } else { + /* First segment (it's NULL). */ + goto new_entry; } - tp->t_segqlen++; + first = TAILQ_FIRST(&tp->t_segq); + if (SEQ_LT(th->th_seq, first->tqe_start) && + SEQ_GEQ((th->th_seq + *tlenp),first->tqe_start) && + SEQ_LT((th->th_seq + *tlenp), (first->tqe_start + first->tqe_len))) { + /* + * The head of the queue is prepended by this and + * it may be the one I want most. + */ + /** + * first-------+ + * v + * rea: |---| |---| |---| + * new: |---| + * Note the case we do not deal with here is: + * rea= |---| |---| |---| + * new= |----| + * Due to the fact that it could be + * new |--------------------| + * And we might need to merge forward. + */ +#ifdef INVARIANTS + struct mbuf *firstmbuf; +#endif +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path3, 1); +#endif + if (SEQ_LT(th->th_seq, tp->rcv_nxt)) { + /* + * The resend was even before + * what we have. We need to trim it. + * Note TSNH (it should be trimmed + * before the call to tcp_reass()). + */ +#ifdef INVARIANTS + panic("th->th_seq:%u rcv_nxt:%u tp:%p not pre-trimmed", + th->th_seq, tp->rcv_nxt, tp); +#else + i = tp->rcv_nxt - th->th_seq; +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, first, NULL, 0, i, TCP_R_LOG_TRIM, 4); +#endif + m_adj(m, i); + th->th_seq += i; + *tlenp -= i; +#endif + } +#ifdef INVARIANTS + firstmbuf = first->tqe_m; +#endif + tcp_reass_prepend(tp, first, m, th, *tlenp, mlast, lenofoh); +#ifdef INVARIANTS + if (firstmbuf == first->tqe_m) { + panic("First stayed same m:%p foobar:%p first->tqe_m:%p tp:%p first:%p", + m, firstmbuf, first->tqe_m, tp, first); + } else if (first->tqe_m != m) { + panic("First did not change to m:%p foobar:%p first->tqe_m:%p tp:%p first:%p", + m, firstmbuf, first->tqe_m, tp, first); + } +#endif + tp->t_segqmbuflen += lenofoh; + *seq_start = first->tqe_start; + *tlenp = first->tqe_len; + goto present; + } else if (SEQ_LT((th->th_seq + *tlenp), first->tqe_start)) { + /* New segment is before our earliest segment. */ + /** + * first---->+ + * v + * rea= |---| .... + * new" |---| + * + */ + goto new_entry; + } /* * Find a segment which begins after this one does. */ - LIST_FOREACH(q, &tp->t_segq, tqe_q) { - if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_fullwalk, 1); +#endif + TAILQ_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_start, th->th_seq)) break; - p = q; } - - /* - * If there is a preceding segment, it may provide some of - * our data already. If so, drop the data from the incoming - * segment. If it provides all of our data, drop us. + p = TAILQ_PREV(q, tsegqe_head, tqe_q); + /** + * Now is this fit just in-between only? + * i.e.: + * p---+ +----q + * v v + * res= |--| |--| |--| + * nee |-| + */ + if (SEQ_LT((th->th_seq + *tlenp), q->tqe_start) && + ((p == NULL) || (SEQ_GT(th->th_seq, (p->tqe_start + p->tqe_len))))) { + /* Yep no overlap */ + goto new_entry; + } + /** + * If we reach here we have some (possibly all) overlap + * such as: + * res= |--| |--| |--| + * new= |----| + * or new= |-----------------| + * or new= |--------| + * or new= |---| + * or new= |-----------| */ - if (p != NULL) { - int i; + if ((p != NULL) && + (SEQ_LEQ(th->th_seq, (p->tqe_start + p->tqe_len)))) { /* conversion to int (in i) handles seq wraparound */ - i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; - if (i > 0) { + +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path4, 1); +#endif + i = p->tqe_start + p->tqe_len - th->th_seq; + if (i >= 0) { if (i >= *tlenp) { + /** + * prev seg---->+ + * v + * reassembly buffer |---| + * new segment |-| + */ TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); + *tlenp = p->tqe_len; + *seq_start = p->tqe_start; m_freem(m); - if (te != &tqs) - uma_zfree(tcp_reass_zone, te); - tp->t_segqlen--; /* * Try to present any queued data * at the left window edge to the user. * This is needed after the 3-WHS - * completes. + * completes. Note this probably + * will not work and we will return. */ - goto present; /* ??? */ + return (0); } - m_adj(m, i); - *tlenp -= i; - th->th_seq += i; + if (i > 0) { + /** + * prev seg---->+ + * v + * reassembly buffer |---| + * new segment |-----| + */ +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path5, 1); +#endif +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, p, NULL, 0, i, TCP_R_LOG_TRIM, 5); +#endif + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + if (th->th_seq == (p->tqe_start + p->tqe_len)) { + /* + * If dovetails in with this one + * append it. + */ + /** + * prev seg---->+ + * v + * reassembly buffer |--| |---| + * new segment |--| + * (note: it was trimmed above if it overlapped) + */ + tcp_reass_append(tp, p, m, th, *tlenp, mlast, lenofoh); + tp->t_segqmbuflen += lenofoh; + } else { +#ifdef INVARIANTS + panic("Impossible cut th_seq:%u p->seq:%u(%d) p:%p tp:%p", + th->th_seq, p->tqe_start, p->tqe_len, + p, tp); +#endif + *tlenp = 0; + m_freem(m); + return (0); + } + q = p; + } else { + /* + * The new data runs over the + * top of previously sack'd data (in q). + * It may be partially overlapping, or + * it may overlap the entire segment. + */ +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path6, 1); +#endif + if (SEQ_GEQ((th->th_seq + *tlenp), (q->tqe_start + q->tqe_len))) { + /* It consumes it all */ + /** + * next seg---->+ + * v + * reassembly buffer |--| |---| + * new segment |----------| + */ +#ifdef TCP_REASS_COUNTERS + counter_u64_add(reass_path7, 1); +#endif + tcp_reass_replace(tp, q, m, th->th_seq, *tlenp, mlast, lenofoh, th->th_flags); + } else { + /* + * We just need to prepend the data + * to this. It does not overrun + * the end. + */ + /** + * next seg---->+ + * v + * reassembly buffer |--| |---| + * new segment |----------| + */ + tcp_reass_prepend(tp, q, m, th, *tlenp, mlast, lenofoh); + tp->t_segqmbuflen += lenofoh; } } - tp->t_rcvoopack++; - TCPSTAT_INC(tcps_rcvoopack); - TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); + /* Now does it go further than that? */ + tcp_reass_merge_forward(tp, q); + *seq_start = q->tqe_start; + *tlenp = q->tqe_len; + goto present; - /* - * While we overlap succeeding segments trim them or, - * if they are completely covered, dequeue them. + /* + * When we reach here we can't combine it + * with any existing segment. + * + * Limit the number of segments that can be queued to reduce the + * potential for mbuf exhaustion. For best performance, we want to be + * able to queue a full window's worth of segments. The size of the + * socket receive buffer determines our advertised window and grows + * automatically when socket buffer autotuning is enabled. Use it as the + * basis for our queue limit. + * + * However, allow the user to specify a ceiling for the number of + * segments in each queue. + * + * Always let the missing segment through which caused this queue. + * NB: Access to the socket buffer is left intentionally unlocked as we + * can tolerate stale information here. + * + * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat + * should work but causes packets to be dropped when they shouldn't. + * Investigate why and re-evaluate the below limit after the behaviour + * is understood. */ - while (q) { - int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; - if (i <= 0) - break; - if (i < q->tqe_len) { - q->tqe_th->th_seq += i; - q->tqe_len -= i; - m_adj(q->tqe_m, i); - break; +new_entry: + if (tcp_new_limits) { + if ((tp->t_segqlen > tcp_reass_queue_guard) && + (*tlenp < MSIZE)) { + /* + * This is really a lie, we are not full but + * are getting a segment that is above + * guard threshold. If it is and its below + * a mbuf size (256) we drop it if it + * can't fill in some place. + */ + TCPSTAT_INC(tcps_rcvreassfull); + *tlenp = 0; + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: queue limit reached, " + "segment dropped\n", s, __func__); + free(s, M_TCPLOG); + } + m_freem(m); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif + return (0); } + } else { - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); - m_freem(q->tqe_m); - uma_zfree(tcp_reass_zone, q); - tp->t_segqlen--; - q = nq; + if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && + tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1, + tcp_reass_maxqueuelen)) { + TCPSTAT_INC(tcps_rcvreassfull); + *tlenp = 0; + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: queue limit reached, " + "segment dropped\n", s, __func__); + free(s, M_TCPLOG); + } + m_freem(m); +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif + return (0); + } } - + /* + * Allocate a new queue entry. If we can't, or hit the zone limit + * just drop the pkt. + */ + te = uma_zalloc(tcp_reass_zone, M_NOWAIT); + if (te == NULL) { + TCPSTAT_INC(tcps_rcvmemdrop); + m_freem(m); + *tlenp = 0; + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, + NULL))) { + log(LOG_DEBUG, "%s; %s: global zone limit " + "reached, segment dropped\n", s, __func__); + free(s, M_TCPLOG); + } + return (0); + } + tp->t_segqlen++; + tp->t_rcvoopack++; + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); /* Insert the new segment queue entry into place. */ te->tqe_m = m; - te->tqe_th = th; + te->tqe_flags = th->th_flags; te->tqe_len = *tlenp; - + te->tqe_start = th->th_seq; + te->tqe_last = mlast; + te->tqe_mbuf_cnt = lenofoh; + tp->t_segqmbuflen += te->tqe_mbuf_cnt; if (p == NULL) { - LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q); } else { - KASSERT(te != &tqs, ("%s: temporary stack based entry not " - "first element in queue", __func__)); - LIST_INSERT_AFTER(p, te, tqe_q); + TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q); } - +#ifdef TCP_REASS_LOGGING + tcp_reass_log_new_in(tp, th->th_seq, *tlenp, m, TCP_R_LOG_NEW_ENTRY, te); +#endif present: /* * Present data to user, advancing rcv_nxt through @@ -314,24 +1025,56 @@ present: */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = LIST_FIRST(&tp->t_segq); - if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + q = TAILQ_FIRST(&tp->t_segq); + KASSERT(q == NULL || SEQ_GEQ(q->tqe_start, tp->rcv_nxt), + ("Reassembly queue for %p has stale entry at head", tp)); + if (!q || q->tqe_start != tp->rcv_nxt) { +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif return (0); + } SOCKBUF_LOCK(&so->so_rcv); do { tp->rcv_nxt += q->tqe_len; - flags = q->tqe_th->th_flags & TH_FIN; - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + flags = q->tqe_flags & TH_FIN; + nq = TAILQ_NEXT(q, tqe_q); + TAILQ_REMOVE(&tp->t_segq, q, tqe_q); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(q->tqe_m); - else + } else { +#ifdef TCP_REASS_LOGGING + tcp_reass_log_new_in(tp, q->tqe_start, q->tqe_len, q->tqe_m, TCP_R_LOG_READ, q); + tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 1); +#endif sbappendstream_locked(&so->so_rcv, q->tqe_m, 0); - if (q != &tqs) + } +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 2); +#endif + KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt, + ("tp:%p seg queue goes negative", tp)); + tp->t_segqmbuflen -= q->tqe_mbuf_cnt; + if (q != &tqs) uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; q = nq; - } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + } while (q && q->tqe_start == tp->rcv_nxt); + if (TAILQ_EMPTY(&tp->t_segq) && + (tp->t_segqmbuflen != 0)) { +#ifdef INVARIANTS + panic("tp:%p segq:%p len:%d queue empty", + tp, &tp->t_segq, tp->t_segqmbuflen); +#else +#ifdef TCP_REASS_LOGGING + tcp_log_reassm(tp, NULL, NULL, th->th_seq, *tlenp, TCP_R_LOG_ZERO, 0); +#endif + tp->t_segqmbuflen = 0; +#endif + } +#ifdef TCP_REASS_LOGGING + tcp_reass_log_dump(tp); +#endif sorwakeup_locked(so); return (flags); } diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index 787213b0..4852ffaf 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -216,13 +216,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); -static VNET_DEFINE(int, icmp_may_rst) = 1; +VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); -static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; +VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, @@ -239,6 +239,10 @@ VNET_DEFINE(uma_zone_t, sack_hole_zone); VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif +#define TS_OFFSET_SECRET_LENGTH 32 +VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); +#define V_ts_offset_secret VNET(ts_offset_secret) + static int tcp_default_fb_init(struct tcpcb *tp); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); @@ -701,7 +705,7 @@ struct tcpcb_mem { #endif }; -static VNET_DEFINE(uma_zone_t, tcpcb_zone); +VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); @@ -949,11 +953,10 @@ deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); - /* XXX handle */ VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); - LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { INP_WUNLOCK(inp); @@ -1099,6 +1102,7 @@ tcp_init(void) /* Initialize the TCP logging data. */ tcp_log_init(); #endif + arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); if (tcp_soreceive_stream) { #ifdef INET @@ -1629,7 +1633,7 @@ tcp_newtcpcb(struct inpcb *inp) tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; - /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ + TAILQ_INIT(&tp->t_segq); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : @@ -1723,7 +1727,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) * therefore don't enter the loop below until the connection * list has stabilised. */ - LIST_FOREACH(inp, &V_tcb, inp_list) { + CK_LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && @@ -1737,11 +1741,18 @@ tcp_ccalgounload(struct cc_algo *unload_algo) */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); - /* NewReno does not require any init. */ - CC_ALGO(tp) = &newreno_cc_algo; - /* XXX defer to epoch_call */ if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); + CC_DATA(tp) = NULL; + /* + * NewReno may allocate memory on + * demand for certain stateful + * configuration as needed, but is + * coded to never fail on memory + * allocation failure so it is a safe + * fallback. + */ + CC_ALGO(tp) = &newreno_cc_algo; } } INP_WUNLOCK(inp); @@ -1893,6 +1904,7 @@ tcp_discardcb(struct tcpcb *tp) /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); + CC_DATA(tp) = NULL; #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); @@ -1922,10 +1934,11 @@ tcp_timer_discard(void *ptp) { struct inpcb *inp; struct tcpcb *tp; + struct epoch_tracker et; tp = (struct tcpcb *)ptp; CURVNET_SET(tp->t_vnet); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); @@ -1945,13 +1958,13 @@ tcp_timer_discard(void *ptp) tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); return; } } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); } @@ -2024,10 +2037,12 @@ tcp_drain(void) * useful. */ INP_INFO_WLOCK(&V_tcbinfo); - LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { - if (inpb->inp_flags & INP_TIMEWAIT) - continue; + CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inpb); + if (inpb->inp_flags & INP_TIMEWAIT) { + INP_WUNLOCK(inpb); + continue; + } if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); @@ -2110,10 +2125,10 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; - struct in_pcblist *il; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the TCB list is too time-consuming and @@ -2157,12 +2172,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) if (error) return (error); - il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); - inp_list = il->il_inp_list; + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); INP_INFO_WLOCK(&V_tcbinfo); - for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; - inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { + for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; + inp != NULL && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* @@ -2201,10 +2215,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - - il->il_count = n; - il->il_pcbinfo = &V_tcbinfo; - epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); + } + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (!error) { /* @@ -2221,6 +2239,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } + free(inp_list, M_TEMP); return (error); } @@ -2342,6 +2361,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; + struct epoch_tracker et; tcp_seq icmp_tcp_seq; int mtu; @@ -2373,7 +2393,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { @@ -2438,7 +2458,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET */ @@ -2456,6 +2476,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct in_conninfo inc; + struct epoch_tracker et; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; @@ -2517,7 +2538,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { @@ -2589,10 +2610,45 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET6 */ +static uint32_t +tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) +{ + MD5_CTX ctx; + uint32_t hash[4]; + + MD5Init(&ctx); + MD5Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); + MD5Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); + switch (inc->inc_flags & INC_ISIPV6) { +#ifdef INET + case 0: + MD5Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); + MD5Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); + break; +#endif +#ifdef INET6 + case INC_ISIPV6: + MD5Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); + MD5Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); + break; +#endif + } + MD5Update(&ctx, key, len); + MD5Final((unsigned char *)hash, &ctx); + + return (hash[0]); +} + +uint32_t +tcp_new_ts_offset(struct in_conninfo *inc) +{ + return (tcp_keyed_hash(inc, V_ts_offset_secret, + sizeof(V_ts_offset_secret))); +} /* * Following is where TCP initial sequence number generation occurs. @@ -2634,19 +2690,20 @@ out: * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, - * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In + * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) +#define ISN_SECRET_LENGTH 32 -static VNET_DEFINE(u_char, isn_secret[32]); -static VNET_DEFINE(int, isn_last); -static VNET_DEFINE(int, isn_last_reseed); -static VNET_DEFINE(u_int32_t, isn_offset); -static VNET_DEFINE(u_int32_t, isn_offset_old); +VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); +VNET_DEFINE_STATIC(int, isn_last); +VNET_DEFINE_STATIC(int, isn_last_reseed); +VNET_DEFINE_STATIC(u_int32_t, isn_offset); +VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) @@ -2655,45 +2712,23 @@ static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_offset_old VNET(isn_offset_old) tcp_seq -tcp_new_isn(struct tcpcb *tp) +tcp_new_isn(struct in_conninfo *inc) { - MD5_CTX isn_ctx; - u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; - INP_WLOCK_ASSERT(tp->t_inpcb); - ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { - read_random(&V_isn_secret, sizeof(V_isn_secret)); + arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ - MD5Init(&isn_ctx); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); -#ifdef INET6 - if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, - sizeof(struct in6_addr)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, - sizeof(struct in6_addr)); - } else -#endif - { - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, - sizeof(struct in_addr)); - MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, - sizeof(struct in_addr)); - } - MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); - MD5Final((u_char *) &md5_buffer, &isn_ctx); - new_isn = (tcp_seq) md5_buffer[0]; + new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, + sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { @@ -2840,6 +2875,9 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); + if (inc->inc_flags & INC_IPV6MINMTU) + return (IPV6_MMTU); + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, @@ -2928,6 +2966,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; + struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif @@ -2987,7 +3026,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) default: return (EINVAL); } - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -3026,7 +3065,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c index e163aa54..6fdd859d 100644 --- a/freebsd/sys/netinet/tcp_syncache.c +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -71,6 +71,7 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_var.h> @@ -104,19 +105,19 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -static VNET_DEFINE(int, tcp_syncookies) = 1; +VNET_DEFINE_STATIC(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); -static VNET_DEFINE(int, tcp_syncookiesonly) = 0; +VNET_DEFINE_STATIC(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); -static VNET_DEFINE(int, functions_inherit_listen_socket_stack) = 1; +VNET_DEFINE_STATIC(int, functions_inherit_listen_socket_stack) = 1; #define V_functions_inherit_listen_socket_stack \ VNET(functions_inherit_listen_socket_stack) SYSCTL_INT(_net_inet_tcp, OID_AUTO, functions_inherit_listen_socket_stack, @@ -164,7 +165,7 @@ static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 -static VNET_DEFINE(struct tcp_syncache, tcp_syncache); +VNET_DEFINE_STATIC(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, @@ -185,8 +186,27 @@ SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_R &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); -SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, +static int +sysctl_net_inet_tcp_syncache_rexmtlimit_check(SYSCTL_HANDLER_ARGS) +{ + int error; + u_int new; + + new = V_tcp_syncache.rexmt_limit; + error = sysctl_handle_int(oidp, &new, 0, req); + if ((error == 0) && (req->newptr != NULL)) { + if (new > TCP_MAXRXTSHIFT) + error = EINVAL; + else + V_tcp_syncache.rexmt_limit = new; + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, + sysctl_net_inet_tcp_syncache_rexmtlimit_check, "UI", "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; @@ -398,8 +418,14 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch) static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { - sc->sc_rxttime = ticks + - TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); + int rexmt; + + if (sc->sc_rxmits == 0) + rexmt = TCPTV_RTOBASE; + else + TCPT_RANGESET(rexmt, TCPTV_RTOBASE * tcp_syn_backoff[sc->sc_rxmits], + tcp_rexmit_min, TCPTV_REXMTMAX); + sc->sc_rxttime = ticks + rexmt; sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; @@ -746,10 +772,9 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) goto abort; } #ifdef INET6 - if (sc->sc_inc.inc_flags & INC_ISIPV6) { + if (inp->inp_vflag & INP_IPV6PROTO) { struct inpcb *oinp = sotoinpcb(lso); - struct in6_addr laddr6; - struct sockaddr_in6 sin6; + /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) @@ -763,6 +788,11 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); + } + + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + struct in6_addr laddr6; + struct sockaddr_in6 sin6; sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); @@ -1153,25 +1183,6 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } } - /* - * If timestamps were negotiated, the reflected timestamp - * must be equal to what we actually sent in the SYN|ACK - * except in the case of 0. Some boxes are known for sending - * broken timestamp replies during the 3whs (and potentially - * during the connection also). - * - * Accept the final ACK of 3whs with reflected timestamp of 0 - * instead of sending a RST and deleting the syncache entry. - */ - if ((to->to_flags & TOF_TS) && to->to_tsecr && - to->to_tsecr != sc->sc_ts) { - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) - log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " - "segment rejected\n", - s, __func__, to->to_tsecr, sc->sc_ts); - goto failed; - } - *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) @@ -1404,6 +1415,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, */ mac_syncache_destroy(&maclabel); #endif + TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " @@ -1418,7 +1430,7 @@ syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); - goto done; + goto donenoprobe; } if (tfo_cookie_valid) { @@ -1498,8 +1510,8 @@ skip_alloc: */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; - sc->sc_ts = tcp_ts_getticks(); sc->sc_flags |= SCF_TIMESTAMP; + sc->sc_tsoff = tcp_new_ts_offset(inc); } if (to->to_flags & TOF_SCALE) { int wscale = 0; @@ -1571,6 +1583,7 @@ skip_alloc: goto tfo_expanded; } + TCP_PROBE5(receive, NULL, NULL, m, NULL, th); /* * Do a standard 3-way handshake. */ @@ -1586,8 +1599,11 @@ skip_alloc: syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } + goto donenoprobe; done: + TCP_PROBE5(receive, NULL, NULL, m, NULL, th); +donenoprobe: if (m) { *lsop = NULL; m_freem(m); @@ -1727,8 +1743,7 @@ syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { - /* Virgin timestamp or TCP cookie enhanced one. */ - to.to_tsval = sc->sc_ts; + to.to_tsval = sc->sc_tsoff + tcp_ts_getticks(); to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } @@ -1799,6 +1814,7 @@ syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, return (error); } #endif + TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif @@ -1819,6 +1835,7 @@ syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, return (error); } #endif + TCP_PROBE5(send, NULL, NULL, ip, NULL, th); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif @@ -2033,12 +2050,6 @@ syncookie_generate(struct syncache_head *sch, struct syncache *sc) iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); - /* Randomize the timestamp. */ - if (sc->sc_flags & SCF_TIMESTAMP) { - sc->sc_ts = arc4random(); - sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); - } - TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } @@ -2125,8 +2136,7 @@ syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; - sc->sc_ts = to->to_tsecr; - sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); + sc->sc_tsoff = tcp_new_ts_offset(inc); } if (to->to_flags & TOF_SIGNATURE) diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h index 92a7c7c9..0104e528 100644 --- a/freebsd/sys/netinet/tcp_syncache.h +++ b/freebsd/sys/netinet/tcp_syncache.h @@ -56,7 +56,6 @@ struct syncache { int sc_rxttime; /* retransmit time */ u_int16_t sc_rxmits; /* retransmit counter */ u_int32_t sc_tsreflect; /* timestamp to reflect */ - u_int32_t sc_ts; /* our timestamp to send */ u_int32_t sc_tsoff; /* ts offset w/ syncookies */ u_int32_t sc_flowlabel; /* IPv6 flowlabel */ tcp_seq sc_irs; /* seq from peer */ diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index 422e5122..c50af2bb 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -73,6 +73,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_log_buf.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/tcp_seq.h> #include <netinet/cc/cc.h> #ifdef INET6 #include <netinet6/tcp6_var.h> @@ -141,7 +142,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, /* max idle probes */ int tcp_maxpersistidle; -static int tcp_rexmit_drop_options = 0; +int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); @@ -176,18 +177,13 @@ static int per_cpu_timers = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); -#if 0 -#define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ - ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) -#endif - /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ -static inline int +inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; @@ -245,7 +241,7 @@ int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; -static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ +int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. @@ -280,55 +276,10 @@ tcp_timer_delack(void *xtp) CURVNET_RESTORE(); } -/* - * When a timer wants to remove a TCB it must - * hold the INP_INFO_RLOCK(). The timer function - * should only have grabbed the INP_WLOCK() when - * it entered. To safely switch to holding both the - * INP_INFO_RLOCK() and the INP_WLOCK() we must first - * grab a reference on the inp, which will hold the inp - * so that it can't be removed. We then unlock the INP_WLOCK(), - * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() - * we proceed again to get the INP_WLOCK() (this preserves proper - * lock order). After acquiring the INP_WLOCK we must check if someone - * else deleted the pcb i.e. the inp_flags check. - * If so we return 1 otherwise we return 0. - * - * No matter what the tcp_inpinfo_lock_add() function - * returns the caller must afterwards call tcp_inpinfo_lock_del() - * to drop the locks and reference properly. - */ - -int -tcp_inpinfo_lock_add(struct inpcb *inp) -{ - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - return(1); - } - return(0); - -} - void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) { - INP_INFO_RUNLOCK(&V_tcbinfo); - if (inp && (tp == NULL)) { - /* - * If tcp_close/drop() gets called and tp - * returns NULL, then the function dropped - * the inp lock, we hold a reference keeping - * this around, so we must re-aquire the - * INP_WLOCK() in order to proceed with - * our dropping the inp reference. - */ - INP_WLOCK(inp); - } - if (inp && in_pcbrele_wlocked(inp) == 0) + if (inp && tp != NULL) INP_WUNLOCK(inp); } @@ -337,6 +288,7 @@ tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -383,11 +335,13 @@ tcp_timer_2msl(void *xtp) tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_close(tp); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } else { @@ -395,15 +349,17 @@ tcp_timer_2msl(void *xtp) callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_close(tp); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } - } + } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) @@ -424,6 +380,7 @@ tcp_timer_keep(void *xtp) struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -517,11 +474,11 @@ tcp_timer_keep(void *xtp) dropit: TCPSTAT_INC(tcps_keepdrops); - - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -530,8 +487,9 @@ dropit: PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); -out: + out: CURVNET_RESTORE(); } @@ -540,6 +498,7 @@ tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; + struct epoch_tracker et; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -579,11 +538,13 @@ tcp_timer_persist(void *xtp) (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -594,11 +555,13 @@ tcp_timer_persist(void *xtp) if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -624,6 +587,7 @@ tcp_timer_rexmt(void * xtp) CURVNET_SET(tp->t_vnet); int rexmt; struct inpcb *inp; + struct epoch_tracker et; #ifdef TCPDEBUG int ostate; @@ -660,11 +624,13 @@ tcp_timer_rexmt(void * xtp) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - if (tcp_inpinfo_lock_add(inp)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { tcp_inpinfo_lock_del(inp, tp); goto out; } + INP_INFO_RLOCK_ET(&V_tcbinfo, et); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -950,6 +916,111 @@ tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) return callout_active(t_callout); } +/* + * Stop the timer from running, and apply a flag + * against the timer_flags that will force the + * timer never to run. The flag is needed to assure + * a race does not leave it running and cause + * the timer to possibly restart itself (keep and persist + * especially do this). + */ +int +tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type) +{ + struct callout *t_callout; + uint32_t t_flags; + + switch (timer_type) { + case TT_DELACK: + t_flags = TT_DELACK_SUS; + t_callout = &tp->t_timers->tt_delack; + break; + case TT_REXMT: + t_flags = TT_REXMT_SUS; + t_callout = &tp->t_timers->tt_rexmt; + break; + case TT_PERSIST: + t_flags = TT_PERSIST_SUS; + t_callout = &tp->t_timers->tt_persist; + break; + case TT_KEEP: + t_flags = TT_KEEP_SUS; + t_callout = &tp->t_timers->tt_keep; + break; + case TT_2MSL: + t_flags = TT_2MSL_SUS; + t_callout = &tp->t_timers->tt_2msl; + break; + default: + panic("tp:%p bad timer_type 0x%x", tp, timer_type); + } + tp->t_timers->tt_flags |= t_flags; + return (callout_stop(t_callout)); +} + +void +tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type) +{ + switch (timer_type) { + case TT_DELACK: + if (tp->t_timers->tt_flags & TT_DELACK_SUS) { + tp->t_timers->tt_flags &= ~TT_DELACK_SUS; + if (tp->t_flags & TF_DELACK) { + /* Delayed ack timer should be up activate a timer */ + tp->t_flags &= ~TF_DELACK; + tcp_timer_activate(tp, TT_DELACK, + tcp_delacktime); + } + } + break; + case TT_REXMT: + if (tp->t_timers->tt_flags & TT_REXMT_SUS) { + tp->t_timers->tt_flags &= ~TT_REXMT_SUS; + if (SEQ_GT(tp->snd_max, tp->snd_una) && + (tcp_timer_active((tp), TT_PERSIST) == 0) && + tp->snd_wnd) { + /* We have outstanding data activate a timer */ + tcp_timer_activate(tp, TT_REXMT, + tp->t_rxtcur); + } + } + break; + case TT_PERSIST: + if (tp->t_timers->tt_flags & TT_PERSIST_SUS) { + tp->t_timers->tt_flags &= ~TT_PERSIST_SUS; + if (tp->snd_wnd == 0) { + /* Activate the persists timer */ + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + } + break; + case TT_KEEP: + if (tp->t_timers->tt_flags & TT_KEEP_SUS) { + tp->t_timers->tt_flags &= ~TT_KEEP_SUS; + tcp_timer_activate(tp, TT_KEEP, + TCPS_HAVEESTABLISHED(tp->t_state) ? + TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); + } + break; + case TT_2MSL: + if (tp->t_timers->tt_flags &= TT_2MSL_SUS) { + tp->t_timers->tt_flags &= ~TT_2MSL_SUS; + if ((tp->t_state == TCPS_FIN_WAIT_2) && + ((tp->t_inpcb->inp_socket == NULL) || + (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) { + /* Star the 2MSL timer */ + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : TP_MAXIDLE(tp)); + } + } + break; + default: + panic("tp:%p bad timer_type 0x%x", tp, timer_type); + } +} + void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h index b0ff3809..a2ab6ca5 100644 --- a/freebsd/sys/netinet/tcp_timer.h +++ b/freebsd/sys/netinet/tcp_timer.h @@ -168,11 +168,15 @@ struct tcp_timer { #define TT_2MSL 0x0010 #define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) -#define TT_DELACK_RST 0x0100 -#define TT_REXMT_RST 0x0200 -#define TT_PERSIST_RST 0x0400 -#define TT_KEEP_RST 0x0800 -#define TT_2MSL_RST 0x1000 +/* + * Suspend flags - used when suspending a timer + * from ever running again. + */ +#define TT_DELACK_SUS 0x0100 +#define TT_REXMT_SUS 0x0200 +#define TT_PERSIST_SUS 0x0400 +#define TT_KEEP_SUS 0x0800 +#define TT_2MSL_SUS 0x1000 #define TT_STOPPED 0x00010000 @@ -196,6 +200,8 @@ extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; extern int tcp_syn_backoff[]; +extern int tcp_totbackoff; +extern int tcp_rexmit_drop_options; extern int tcp_always_keepalive; extern int tcp_finwait2_timeout; @@ -208,7 +214,6 @@ VNET_DECLARE(int, tcp_pmtud_blackhole_mss); VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) -int tcp_inpinfo_lock_add(struct inpcb *inp); void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp); void tcp_timer_init(void); diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index afadf7cd..8a28283f 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include <net/vnet.h> #include <netinet/in.h> +#include <netinet/in_kdtrace.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> @@ -98,7 +99,7 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> -static VNET_DEFINE(uma_zone_t, tcptw_zone); +VNET_DEFINE_STATIC(uma_zone_t, tcptw_zone); #define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; @@ -113,11 +114,11 @@ static int maxtcptw; * - a tcptw relies on its inpcb reference counting for memory stability * - a tcptw is dereferenceable only while its inpcb is locked */ -static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); +VNET_DEFINE_STATIC(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) /* Global timewait lock */ -static VNET_DEFINE(struct rwlock, tw_lock); +VNET_DEFINE_STATIC(struct rwlock, tw_lock); #define V_tw_lock VNET(tw_lock) #define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) @@ -174,7 +175,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); -static VNET_DEFINE(int, nolocaltimewait) = 0; +VNET_DEFINE_STATIC(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, @@ -208,11 +209,12 @@ void tcp_tw_destroy(void) { struct tcptw *tw; + struct epoch_tracker et; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); @@ -230,6 +232,7 @@ tcp_twstart(struct tcpcb *tp) struct tcptw twlocal, *tw; struct inpcb *inp = tp->t_inpcb; struct socket *so; + uint32_t recwin; bool acknow, local; #ifdef INET6 bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; @@ -292,10 +295,16 @@ tcp_twstart(struct tcpcb *tp) /* * Recover last window size sent. */ - if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) - tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; - else - tw->last_win = 0; + so = inp->inp_socket; + recwin = lmin(lmax(sbspace(&so->so_rcv), 0), + (long)TCP_MAXWIN << tp->rcv_scale); + if (recwin < (so->so_rcv.sb_hiwat / 4) && + recwin < tp->t_maxseg) + recwin = 0; + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (tp->rcv_adv - tp->rcv_nxt)) + recwin = (tp->rcv_adv - tp->rcv_nxt); + tw->last_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Set t_recent if timestamps are used on the connection. @@ -332,7 +341,6 @@ tcp_twstart(struct tcpcb *tp) * and might not be needed here any longer. */ tcp_discardcb(tp); - so = inp->inp_socket; soisdisconnected(so); tw->tw_so_options = so->so_options; inp->inp_flags |= INP_TIMEWAIT; @@ -451,9 +459,14 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || - th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) + th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) { + TCP_PROBE5(receive, NULL, NULL, m, NULL, th); tcp_twrespond(tw, TH_ACK); + goto dropnoprobe; + } drop: + TCP_PROBE5(receive, NULL, NULL, m, NULL, th); +dropnoprobe: INP_WUNLOCK(inp); m_freem(m); return (0); @@ -599,6 +612,7 @@ tcp_twrespond(struct tcptw *tw, int flags) th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(inp, NULL); + TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } @@ -614,6 +628,7 @@ tcp_twrespond(struct tcptw *tw, int flags) ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); + TCP_PROBE5(send, NULL, NULL, ip, NULL, th); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); @@ -676,6 +691,7 @@ tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; struct inpcb *inp; + struct epoch_tracker et; #ifdef INVARIANTS if (reuse) { @@ -709,54 +725,46 @@ tcp_tw_2msl_scan(int reuse) in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { - - INP_WLOCK(inp); - tw = intotw(inp); - if (in_pcbrele_wlocked(inp)) { - if (__predict_true(tw == NULL)) { - INP_INFO_RUNLOCK(&V_tcbinfo); - continue; - } else { - /* This should not happen as in TIMEWAIT - * state the inp should not be destroyed - * before its tcptw. If INVARIANTS is - * defined panic. - */ + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + if (__predict_true(tw == NULL)) { + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + continue; + } else { + /* This should not happen as in TIMEWAIT + * state the inp should not be destroyed + * before its tcptw. If INVARIANTS is + * defined panic. + */ #ifdef INVARIANTS - panic("%s: Panic before an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + panic("%s: Panic before an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #else - log(LOG_ERR, "%s: Avoid an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + log(LOG_ERR, "%s: Avoid an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #endif - INP_INFO_RUNLOCK(&V_tcbinfo); - break; - } - } - - if (tw == NULL) { - /* tcp_twclose() has already been called */ - INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - continue; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + break; } + } - tcp_twclose(tw, reuse); - INP_INFO_RUNLOCK(&V_tcbinfo); - if (reuse) - return tw; - } else { - /* INP_INFO lock is busy, continue later. */ - INP_WLOCK(inp); - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); - break; + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + continue; } + + tcp_twclose(tw, reuse); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + if (reuse) + return tw; } return NULL; diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index bf2cff4c..617f60d0 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -278,11 +278,12 @@ tcp_usr_detach(struct socket *so) { struct inpcb *inp; int rlock = 0; + struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); if (!INP_INFO_WLOCKED(&V_tcbinfo)) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); rlock = 1; } INP_WLOCK(inp); @@ -290,7 +291,7 @@ tcp_usr_detach(struct socket *so) ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); if (rlock) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #ifdef INET @@ -379,6 +380,11 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6p); + if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + error = EAFNOSUPPORT; + INP_HASH_WUNLOCK(&V_tcbinfo); + goto out; + } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, @@ -608,6 +614,10 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) } in6_sin6_2_sin(&sin, sin6p); + if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + error = EAFNOSUPPORT; + goto out; + } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = prison_remote_ip4(td->td_ucred, @@ -670,10 +680,11 @@ tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; int error = 0; TCPDEBUG0; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); @@ -690,7 +701,7 @@ out: TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } @@ -749,6 +760,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; + struct epoch_tracker et; in_port_t port = 0; int v4 = 0; TCPDEBUG0; @@ -758,7 +770,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -785,7 +797,7 @@ out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -805,9 +817,10 @@ tcp_usr_shutdown(struct socket *so) int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); @@ -826,7 +839,7 @@ out: TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } @@ -889,6 +902,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker net_et; +#ifdef INET +#ifdef INET6 + struct sockaddr_in sin; +#endif + struct sockaddr_in *sinp; +#endif #ifdef INET6 int isipv6; #endif @@ -899,7 +919,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * this call. */ if (flags & PRUS_EOF) - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, net_et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -915,11 +935,124 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, error = ECONNRESET; goto out; } -#ifdef INET6 - isipv6 = nam && nam->sa_family == AF_INET6; -#endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); + if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { + switch (nam->sa_family) { +#ifdef INET + case AF_INET: + sinp = (struct sockaddr_in *)nam; + if (sinp->sin_len != sizeof(struct sockaddr_in)) { + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + if ((inp->inp_vflag & INP_IPV6) != 0) { + if (m) + m_freem(m); + error = EAFNOSUPPORT; + goto out; + } + if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { + if (m) + m_freem(m); + error = EAFNOSUPPORT; + goto out; + } + if ((error = prison_remote_ip4(td->td_ucred, + &sinp->sin_addr))) { + if (m) + m_freem(m); + goto out; + } +#ifdef INET6 + isipv6 = 0; +#endif + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6p; + + sin6p = (struct sockaddr_in6 *)nam; + if (sin6p->sin6_len != sizeof(struct sockaddr_in6)) { + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + if (IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) { + if (m) + m_freem(m); + error = EAFNOSUPPORT; + goto out; + } + if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { +#ifdef INET + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { + error = EINVAL; + if (m) + m_freem(m); + goto out; + } + if ((inp->inp_vflag & INP_IPV4) == 0) { + error = EAFNOSUPPORT; + if (m) + m_freem(m); + goto out; + } + inp->inp_vflag &= ~INP_IPV6; + sinp = &sin; + in6_sin6_2_sin(sinp, sin6p); + if (IN_MULTICAST( + ntohl(sinp->sin_addr.s_addr))) { + error = EAFNOSUPPORT; + if (m) + m_freem(m); + goto out; + } + if ((error = prison_remote_ip4(td->td_ucred, + &sinp->sin_addr))) { + if (m) + m_freem(m); + goto out; + } + isipv6 = 0; +#else /* !INET */ + error = EAFNOSUPPORT; + if (m) + m_freem(m); + goto out; +#endif /* INET */ + } else { + if ((inp->inp_vflag & INP_IPV6) == 0) { + if (m) + m_freem(m); + error = EAFNOSUPPORT; + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_inc.inc_flags |= INC_ISIPV6; + if ((error = prison_remote_ip6(td->td_ucred, + &sin6p->sin6_addr))) { + if (m) + m_freem(m); + goto out; + } + isipv6 = 1; + } + break; + } +#endif /* INET6 */ + default: + if (m) + m_freem(m); + error = EAFNOSUPPORT; + goto out; + } + } if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { @@ -947,7 +1080,8 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, else #endif #ifdef INET - error = tcp_connect(tp, nam, td); + error = tcp_connect(tp, + (struct sockaddr *)sinp, td); #endif if (error) goto out; @@ -1016,7 +1150,8 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, else #endif #ifdef INET - error = tcp_connect(tp, nam, td); + error = tcp_connect(tp, + (struct sockaddr *)sinp, td); #endif if (error) goto out; @@ -1042,7 +1177,7 @@ out: ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, net_et); return (error); } @@ -1081,12 +1216,13 @@ tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -1112,7 +1248,7 @@ tcp_usr_abort(struct socket *so) } INP_WUNLOCK(inp); dropped: - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } /* @@ -1123,12 +1259,13 @@ tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -1152,7 +1289,7 @@ tcp_usr_close(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } /* @@ -1304,7 +1441,9 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); - tp->iss = tcp_new_isn(tp); + tp->iss = tcp_new_isn(&inp->inp_inc); + if (tp->t_flags & TF_REQ_TSTMP) + tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; @@ -1343,7 +1482,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); - tp->iss = tcp_new_isn(tp); + tp->iss = tcp_new_isn(&inp->inp_inc); + if (tp->t_flags & TF_REQ_TSTMP) + tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; @@ -1445,6 +1586,42 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) if (inp->inp_vflag & INP_IPV6PROTO) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); + /* + * In case of the IPV6_USE_MIN_MTU socket option, + * the INC_IPV6MINMTU flag to announce a corresponding + * MSS during the initial handshake. + * If the TCP connection is not in the front states, + * just reduce the MSS being used. + * This avoids the sending of TCP segments which will + * be fragmented at the IPv6 layer. + */ + if ((error == 0) && + (sopt->sopt_dir == SOPT_SET) && + (sopt->sopt_level == IPPROTO_IPV6) && + (sopt->sopt_name == IPV6_USE_MIN_MTU)) { + INP_WLOCK(inp); + if ((inp->inp_flags & + (INP_TIMEWAIT | INP_DROPPED))) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + inp->inp_inc.inc_flags |= INC_IPV6MINMTU; + tp = intotcpcb(inp); + if ((tp->t_state >= TCPS_SYN_SENT) && + (inp->inp_inc.inc_flags & INC_ISIPV6)) { + struct ip6_pktopts *opt; + + opt = inp->in6p_outputopts; + if ((opt != NULL) && + (opt->ip6po_minmtu == + IP6PO_MINMTU_ALL)) { + if (tp->t_maxseg > TCP6_MSS) { + tp->t_maxseg = TCP6_MSS; + } + } + } + INP_WUNLOCK(inp); + } } #endif /* INET6 */ #if defined(INET6) && defined(INET) @@ -1487,7 +1664,6 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) return (0); } if (tp->t_state != TCPS_CLOSED) { - int error=EINVAL; /* * The user has advanced the state * past the initial point, we may not @@ -1500,7 +1676,8 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) * still be possible? */ error = (*blk->tfb_tcp_handoff_ok)(tp); - } + } else + error = EINVAL; if (error) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); @@ -1724,6 +1901,7 @@ unlock_and_done: */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); + CC_DATA(tp) = NULL; CC_ALGO(tp) = algo; /* * If something goes pear shaped initialising the new @@ -2045,6 +2223,7 @@ tcp_attach(struct socket *so) { struct tcpcb *tp; struct inpcb *inp; + struct epoch_tracker et; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -2054,10 +2233,10 @@ tcp_attach(struct socket *so) } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (error); } inp = sotoinpcb(so); @@ -2075,12 +2254,12 @@ tcp_attach(struct socket *so) if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); TCPSTATES_INC(TCPS_CLOSED); return (0); } @@ -2106,7 +2285,8 @@ tcp_disconnect(struct tcpcb *tp) * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. */ - if (tp->t_state < TCPS_ESTABLISHED) { + if (tp->t_state < TCPS_ESTABLISHED && + !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); @@ -2383,7 +2563,7 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", - LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); + TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index adaaff61..2fbe07ad 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -46,12 +46,15 @@ #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { - LIST_ENTRY(tseg_qent) tqe_q; + TAILQ_ENTRY(tseg_qent) tqe_q; + struct mbuf *tqe_m; /* mbuf contains packet */ + struct mbuf *tqe_last; /* last mbuf in chain */ + tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ - struct tcphdr *tqe_th; /* a pointer to tcp header */ - struct mbuf *tqe_m; /* mbuf contains packet */ + uint32_t tqe_flags; /* The flags from the th->th_flags */ + uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; -LIST_HEAD(tsegqe_head, tseg_qent); +TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ @@ -79,6 +82,8 @@ struct sackhint { uint64_t _pad[1]; /* TBD */ }; +#define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) + STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* @@ -93,8 +98,11 @@ struct tcpcb { void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ - uint32_t t_state:4, /* state of this connection */ - bits_spare : 24; + uint32_t t_port:16, /* Tunneling (over udp) port */ + t_state:4, /* state of this connection */ + t_idle_reduce : 1, + t_delayed_ack: 7, /* Delayed ack variable */ + bits_spare : 4; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -104,7 +112,7 @@ struct tcpcb { tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ - uint32_t cl1_spare; /* Spare to round out CL 1 */ + uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ @@ -128,6 +136,7 @@ struct tcpcb { /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ + uint32_t t_segqmbuflen; /* Count of bytes mbufs on all entries */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; @@ -189,6 +198,7 @@ struct tcpcb { struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ + u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ @@ -260,12 +270,11 @@ struct tcp_function_block { int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, - int, int, uint8_t, - int); + int, int, uint8_t); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, - int, int, struct timeval *); + int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ @@ -361,6 +370,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function); #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ +#define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ /* * Structure to hold TCP options that are only used during segment @@ -649,6 +659,11 @@ struct tcp_hhook_data { int tso; tcp_seq curack; }; +#ifdef TCP_HHOOK +void hhook_run_tcp_est_out(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to, + uint32_t len, int tso); +#endif #endif /* @@ -668,7 +683,7 @@ struct tcp_hhook_data { */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { - size_t xt_len; /* length of this structure */ + ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ @@ -801,6 +816,9 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) +#define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) +#define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) + #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); @@ -825,7 +843,7 @@ char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); -int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); +int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); @@ -849,8 +867,7 @@ int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); + struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, @@ -893,9 +910,12 @@ struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); +int tcp_timer_suspend(struct tcpcb *, uint32_t); +void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); +int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ @@ -909,7 +929,9 @@ void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; -tcp_seq tcp_new_isn(struct tcpcb *); + +uint32_t tcp_new_ts_offset(struct in_conninfo *); +tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); @@ -921,6 +943,10 @@ void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); +struct mbuf * + tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, + int32_t seglimit, int32_t segsize, struct sockbuf *sb); + static inline void tcp_fields_to_host(struct tcphdr *th) diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index 178a8d5e..9557c154 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -150,7 +150,7 @@ VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); -static VNET_DEFINE(uma_zone_t, udpcb_zone); +VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE @@ -405,6 +405,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) struct sockaddr_in udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; + struct epoch_tracker et; int cscov_partial, iphlen; m = *mp; @@ -535,10 +536,10 @@ udp_input(struct mbuf **mp, int *offp, int proto) struct inpcbhead *pcblist; struct ip_moptions *imo; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_RLOCK_ET(pcbinfo, et); pcblist = udp_get_pcblist(proto); last = NULL; - LIST_FOREACH(inp, pcblist, inp_list) { + CK_LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 @@ -599,8 +600,12 @@ udp_input(struct mbuf **mp, int *offp, int proto) if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { - UDP_PROBE(receive, NULL, last, ip, - last, uh); + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, last, ip, + last, uh); + else + UDP_PROBE(receive, NULL, last, ip, last, + uh); if (udp_append(last, ip, n, iphlen, udp_in)) { goto inp_lost; @@ -618,7 +623,7 @@ udp_input(struct mbuf **mp, int *offp, int proto) * will never clear these options after setting them. */ if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEADDR)) == 0) + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) break; } @@ -631,14 +636,17 @@ udp_input(struct mbuf **mp, int *offp, int proto) UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); goto badunlocked; } - UDP_PROBE(receive, NULL, last, ip, last, uh); + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, last, ip, last, uh); + else + UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(last); inp_lost: - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } @@ -690,6 +698,10 @@ udp_input(struct mbuf **mp, int *offp, int proto) inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport), inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport)); } + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh); + else + UDP_PROBE(receive, NULL, NULL, ip, NULL, uh); UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); @@ -709,6 +721,10 @@ udp_input(struct mbuf **mp, int *offp, int proto) */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); + else + UDP_PROBE(receive, NULL, inp, ip, inp, uh); INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); @@ -724,7 +740,10 @@ udp_input(struct mbuf **mp, int *offp, int proto) } } - UDP_PROBE(receive, NULL, inp, ip, inp, uh); + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); + else + UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); @@ -808,14 +827,15 @@ udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { struct udpcb *up; + void *ctx; + udp_tun_icmp_t func; up = intoudpcb(inp); - if (up->u_icmp_func != NULL) { - INP_RUNLOCK(inp); - (*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx); - } else { - INP_RUNLOCK(inp); - } + ctx = up->u_tun_ctx; + func = up->u_icmp_func; + INP_RUNLOCK(inp); + if (func != NULL) + (*func)(cmd, sa, vip, ctx); } } } else @@ -842,9 +862,9 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; - struct in_pcblist *il; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker et; /* * The process of preparing the PCB list is too time-consuming and @@ -863,10 +883,10 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK_ET(&V_udbinfo, et); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); @@ -880,12 +900,14 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); - il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); - inp_list = il->il_inp_list; - INP_INFO_RLOCK(&V_udbinfo); - for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; - inp = LIST_NEXT(inp, inp_list)) { + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == NULL) + return (ENOMEM); + + INP_INFO_RLOCK_ET(&V_udbinfo, et); + for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { @@ -894,7 +916,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); n = i; error = 0; @@ -910,9 +932,14 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - il->il_count = n; - il->il_pcbinfo = &V_udbinfo; - epoch_call(net_epoch_preempt, &il->il_epoch_ctx, in_pcblist_rele_rlocked); + INP_INFO_WLOCK(&V_udbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (!in_pcbrele_rlocked(inp)) + INP_RUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* @@ -921,13 +948,14 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) * that something happened while we were processing this * request, and it might be necessary to retry. */ - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK_ET(&V_udbinfo, et); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK_ET(&V_udbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } + free(inp_list, M_TEMP); return (error); } @@ -1106,6 +1134,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; + struct epoch_tracker et; int cscov_partial = 0; int error = 0; int ipflags; @@ -1262,7 +1291,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - INP_HASH_RLOCK(pcbinfo); + INP_HASH_RLOCK_ET(pcbinfo, et); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; @@ -1390,6 +1419,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ + ui->ui_v = IPVERSION << 4; ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; @@ -1412,8 +1442,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; - } else - ui->ui_v = IPVERSION << 4; + } /* * Set the Don't Fragment bit in the IP header. @@ -1518,8 +1547,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK(pcbinfo); - UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); + INP_HASH_RUNLOCK_ET(pcbinfo, et); + if (pr == IPPROTO_UDPLITE) + UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); + else + UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); @@ -1538,7 +1570,7 @@ release: } else if (unlock_udbinfo == UH_RLOCKED) { KASSERT(unlock_inp == UH_RLOCKED, ("%s: shared udbinfo lock, excl inp lock", __func__)); - INP_HASH_RUNLOCK(pcbinfo); + INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); } else if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); @@ -1719,7 +1751,6 @@ udp_detach(struct socket *so) INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); - /* XXX defer to epoch_call */ inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); diff --git a/freebsd/sys/netinet/udplite.h b/freebsd/sys/netinet/udplite.h index 0e23cd70..57a1422a 100644 --- a/freebsd/sys/netinet/udplite.h +++ b/freebsd/sys/netinet/udplite.h @@ -29,6 +29,17 @@ #ifndef _NETINET_UDPLITE_H_ #define _NETINET_UDPLITE_H_ +/* + * UDP-Lite protocol header. + * Per RFC 3828, July, 2004. + */ +struct udplitehdr { + u_short udplite_sport; /* UDO-Lite source port */ + u_short udplite_dport; /* UDP-Lite destination port */ + u_short udplite_coverage; /* UDP-Lite checksum coverage */ + u_short udplite_checksum; /* UDP-Lite checksum */ +}; + /* * User-settable options (used with setsockopt). */ |