diff options
Diffstat (limited to 'freebsd/sys/netinet/tcp_timewait.c')
-rw-r--r-- | freebsd/sys/netinet/tcp_timewait.c | 225 |
1 files changed, 156 insertions, 69 deletions
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index 9034fab4..330e842e 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include <net/route.h> #include <net/if.h> +#include <net/if_var.h> #include <net/vnet.h> #include <netinet/in.h> @@ -93,20 +94,41 @@ __FBSDID("$FreeBSD$"); #include <security/mac/mac_framework.h> static VNET_DEFINE(uma_zone_t, tcptw_zone); -#define V_tcptw_zone VNET(tcptw_zone) +#define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global - * tcbinfo lock, which must be held over queue iteration and modification. + * timewait lock, which must be held over queue iteration and modification. + * + * Rules on tcptw usage: + * - a inpcb is always freed _after_ its tcptw + * - a tcptw relies on its inpcb reference counting for memory stability + * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); -#define V_twq_2msl VNET(twq_2msl) +#define V_twq_2msl VNET(twq_2msl) + +/* Global timewait lock */ +static VNET_DEFINE(struct rwlock, tw_lock); +#define V_tw_lock VNET(tw_lock) + +#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) +#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) +#define TW_RLOCK(tw) rw_rlock(&(tw)) +#define TW_WLOCK(tw) rw_wlock(&(tw)) +#define TW_RUNLOCK(tw) rw_runlock(&(tw)) +#define TW_WUNLOCK(tw) rw_wunlock(&(tw)) +#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) +#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) +#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) +#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) static void tcp_tw_2msl_reset(struct tcptw *, int); -static void tcp_tw_2msl_stop(struct tcptw *); +static void tcp_tw_2msl_stop(struct tcptw *, int); +static int tcp_twrespond(struct tcptw *, int); static int tcptw_auto_size(void) @@ -149,7 +171,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, VNET_DEFINE(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); @@ -166,13 +188,14 @@ tcp_tw_init(void) { V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); + TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE @@ -181,11 +204,12 @@ tcp_tw_destroy(void) { struct tcptw *tw; - INP_INFO_WLOCK(&V_tcbinfo); - while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + INP_INFO_RLOCK(&V_tcbinfo); + while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif @@ -206,7 +230,7 @@ tcp_twstart(struct tcpcb *tp) int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait) { @@ -229,8 +253,23 @@ tcp_twstart(struct tcpcb *tp) } } + + /* + * For use only by DTrace. We do not reference the state + * after this point so modifying it in place is not a problem. + */ + tcp_state_change(tp, TCPS_TIME_WAIT); + tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { + /* + * Reached limit on total number of TIMEWAIT connections + * allowed. Remove a connection from TIMEWAIT queue in LRU + * fashion to make room for this connection. + * + * XXX: Check if it possible to always have enough room + * in advance based on guarantees provided by uma_zalloc(). + */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); @@ -239,7 +278,12 @@ tcp_twstart(struct tcpcb *tp) return; } } + /* + * The tcptw will hold a reference on its inpcb until tcp_twclose + * is called + */ tw->tw_inpcb = inp; + in_pcbref(inp); /* Reference from tw */ /* * Recover last window size sent. @@ -313,53 +357,19 @@ tcp_twstart(struct tcpcb *tp) INP_WUNLOCK(inp); } -#if 0 -/* - * The appromixate rate of ISN increase of Microsoft TCP stacks; - * the actual rate is slightly higher due to the addition of - * random positive increments. - * - * Most other new OSes use semi-randomized ISN values, so we - * do not need to worry about them. - */ -#define MS_ISN_BYTES_PER_SECOND 250000 - -/* - * Determine if the ISN we will generate has advanced beyond the last - * sequence number used by the previous connection. If so, indicate - * that it is safe to recycle this tw socket by returning 1. - */ -int -tcp_twrecycleable(struct tcptw *tw) -{ - tcp_seq new_iss = tw->iss; - tcp_seq new_irs = tw->irs; - - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); - new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); - - if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) - return (1); - else - return (0); -} -#endif - /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int -tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, +tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; - /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -460,11 +470,10 @@ tcp_twclose(struct tcptw *tw, int reuse) inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); - tw->tw_inpcb = NULL; - tcp_tw_2msl_stop(tw); + tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); @@ -493,17 +502,17 @@ tcp_twclose(struct tcptw *tw, int reuse) */ INP_WUNLOCK(inp); } - } else + } else { + /* + * The socket has been already cleaned-up for us, only free the + * inpcb. + */ in_pcbfree(inp); + } TCPSTAT_INC(tcps_closed); - crfree(tw->tw_cred); - tw->tw_cred = NULL; - if (reuse) - return; - uma_zfree(V_tcptw_zone, tw); } -int +static int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; @@ -525,7 +534,7 @@ tcp_twrespond(struct tcptw *tw, int flags) INP_WLOCK_ASSERT(inp); - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; @@ -596,9 +605,9 @@ tcp_twrespond(struct tcptw *tw, int flags) m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); - ip->ip_len = m->m_pkthdr.len; + ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); @@ -616,36 +625,114 @@ static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); + + TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); + TW_WUNLOCK(V_tw_lock); } static void -tcp_tw_2msl_stop(struct tcptw *tw) +tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { + struct ucred *cred; + struct inpcb *inp; + int released; + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TW_WLOCK(V_tw_lock); + inp = tw->tw_inpcb; + tw->tw_inpcb = NULL; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); + cred = tw->tw_cred; + tw->tw_cred = NULL; + TW_WUNLOCK(V_tw_lock); + + if (cred != NULL) + crfree(cred); + + released = in_pcbrele_wlocked(inp); + KASSERT(!released, ("%s: inp should not be released here", __func__)); + + if (!reuse) + uma_zfree(V_tcptw_zone, tw); + TCPSTATES_DEC(TCPS_TIME_WAIT); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; + struct inpcb *inp; + +#ifdef INVARIANTS + if (reuse) { + /* + * Exclusive pcbinfo lock is not required in reuse case even if + * two inpcb locks can be acquired simultaneously: + * - the inpcb transitioning to TIME_WAIT state in + * tcp_tw_start(), + * - the inpcb closed by tcp_twclose(). + * + * It is because only inpcbs in FIN_WAIT2 or CLOSING states can + * transition in TIME_WAIT state. Then a pcbcb cannot be in + * TIME_WAIT list and transitioning to TIME_WAIT state at same + * time. + */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } +#endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); for (;;) { + TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) + if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { + TW_RUNLOCK(V_tw_lock); break; - INP_WLOCK(tw->tw_inpcb); - tcp_twclose(tw, reuse); - if (reuse) - return (tw); + } + KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", + __func__)); + + inp = tw->tw_inpcb; + in_pcbref(inp); + TW_RUNLOCK(V_tw_lock); + + if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { + + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + KASSERT(tw == NULL, ("%s: held last inp " + "reference but tw not NULL", __func__)); + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } + + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } + + tcp_twclose(tw, reuse); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (reuse) + return tw; + } else { + /* INP_INFO lock is busy, continue later. */ + INP_WLOCK(inp); + if (!in_pcbrele_wlocked(inp)) + INP_WUNLOCK(inp); + break; + } } - return (NULL); + + return NULL; } |