summaryrefslogtreecommitdiffstats
path: root/freebsd/sys/netinet/tcp_timewait.c
diff options
context:
space:
mode:
Diffstat (limited to 'freebsd/sys/netinet/tcp_timewait.c')
-rw-r--r--freebsd/sys/netinet/tcp_timewait.c225
1 files changed, 156 insertions, 69 deletions
diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c
index 9034fab4..330e842e 100644
--- a/freebsd/sys/netinet/tcp_timewait.c
+++ b/freebsd/sys/netinet/tcp_timewait.c
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/if.h>
+#include <net/if_var.h>
#include <net/vnet.h>
#include <netinet/in.h>
@@ -93,20 +94,41 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
static VNET_DEFINE(uma_zone_t, tcptw_zone);
-#define V_tcptw_zone VNET(tcptw_zone)
+#define V_tcptw_zone VNET(tcptw_zone)
static int maxtcptw;
/*
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - a inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
-#define V_twq_2msl VNET(twq_2msl)
+#define V_twq_2msl VNET(twq_2msl)
+
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define V_tw_lock VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
+#define TW_RLOCK(tw) rw_rlock(&(tw))
+#define TW_WLOCK(tw) rw_wlock(&(tw))
+#define TW_RUNLOCK(tw) rw_runlock(&(tw))
+#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
static void tcp_tw_2msl_reset(struct tcptw *, int);
-static void tcp_tw_2msl_stop(struct tcptw *);
+static void tcp_tw_2msl_stop(struct tcptw *, int);
+static int tcp_twrespond(struct tcptw *, int);
static int
tcptw_auto_size(void)
@@ -149,7 +171,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW,
VNET_DEFINE(int, nolocaltimewait) = 0;
#define V_nolocaltimewait VNET(nolocaltimewait)
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(nolocaltimewait), 0,
"Do not create compressed TCP TIME_WAIT entries for local connections");
@@ -166,13 +188,14 @@ tcp_tw_init(void)
{
V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw);
if (maxtcptw == 0)
uma_zone_set_max(V_tcptw_zone, tcptw_auto_size());
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
+ TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
@@ -181,11 +204,12 @@ tcp_tw_destroy(void)
{
struct tcptw *tw;
- INP_INFO_WLOCK(&V_tcbinfo);
- while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
+ INP_INFO_RLOCK(&V_tcbinfo);
+ while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
@@ -206,7 +230,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@@ -229,8 +253,23 @@ tcp_twstart(struct tcpcb *tp)
}
}
+
+ /*
+ * For use only by DTrace. We do not reference the state
+ * after this point so modifying it in place is not a problem.
+ */
+ tcp_state_change(tp, TCPS_TIME_WAIT);
+
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
+ /*
+ * Reached limit on total number of TIMEWAIT connections
+ * allowed. Remove a connection from TIMEWAIT queue in LRU
+ * fashion to make room for this connection.
+ *
+ * XXX: Check if it possible to always have enough room
+ * in advance based on guarantees provided by uma_zalloc().
+ */
tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
@@ -239,7 +278,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
tw->tw_inpcb = inp;
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -313,53 +357,19 @@ tcp_twstart(struct tcpcb *tp)
INP_WUNLOCK(inp);
}
-#if 0
-/*
- * The appromixate rate of ISN increase of Microsoft TCP stacks;
- * the actual rate is slightly higher due to the addition of
- * random positive increments.
- *
- * Most other new OSes use semi-randomized ISN values, so we
- * do not need to worry about them.
- */
-#define MS_ISN_BYTES_PER_SECOND 250000
-
-/*
- * Determine if the ISN we will generate has advanced beyond the last
- * sequence number used by the previous connection. If so, indicate
- * that it is safe to recycle this tw socket by returning 1.
- */
-int
-tcp_twrecycleable(struct tcptw *tw)
-{
- tcp_seq new_iss = tw->iss;
- tcp_seq new_irs = tw->irs;
-
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
- new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz);
- new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz);
-
- if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt))
- return (1);
- else
- return (0);
-}
-#endif
-
/*
* Returns 1 if the TIME_WAIT state was killed and we should start over,
* looking for a pcb in the listen state. Returns 0 otherwise.
*/
int
-tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th,
+tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th,
struct mbuf *m, int tlen)
{
struct tcptw *tw;
int thflags;
tcp_seq seq;
- /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
/*
@@ -460,11 +470,10 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
- tcp_tw_2msl_stop(tw);
+ tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -493,17 +502,17 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
- crfree(tw->tw_cred);
- tw->tw_cred = NULL;
- if (reuse)
- return;
- uma_zfree(V_tcptw_zone, tw);
}
-int
+static int
tcp_twrespond(struct tcptw *tw, int flags)
{
struct inpcb *inp = tw->tw_inpcb;
@@ -525,7 +534,7 @@ tcp_twrespond(struct tcptw *tw, int flags)
INP_WLOCK_ASSERT(inp);
- m = m_gethdr(M_DONTWAIT, MT_DATA);
+ m = m_gethdr(M_NOWAIT, MT_DATA);
if (m == NULL)
return (ENOBUFS);
m->m_data += max_linkhdr;
@@ -596,9 +605,9 @@ tcp_twrespond(struct tcptw *tw, int flags)
m->m_pkthdr.csum_flags = CSUM_TCP;
th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
- ip->ip_len = m->m_pkthdr.len;
+ ip->ip_len = htons(m->m_pkthdr.len);
if (V_path_mtu_discovery)
- ip->ip_off |= IP_DF;
+ ip->ip_off |= htons(IP_DF);
error = ip_output(m, inp->inp_options, NULL,
((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, inp);
@@ -616,36 +625,114 @@ static void
tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
{
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+ TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+ TW_WUNLOCK(V_tw_lock);
}
static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+ cred = tw->tw_cred;
+ tw->tw_cred = NULL;
+ TW_WUNLOCK(V_tw_lock);
+
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
+ if (!reuse)
+ uma_zfree(V_tcptw_zone, tw);
+ TCPSTATES_DEC(TCPS_TIME_WAIT);
}
struct tcptw *
tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
+ struct inpcb *inp;
+
+#ifdef INVARIANTS
+ if (reuse) {
+ /*
+ * Exclusive pcbinfo lock is not required in reuse case even if
+ * two inpcb locks can be acquired simultaneously:
+ * - the inpcb transitioning to TIME_WAIT state in
+ * tcp_tw_start(),
+ * - the inpcb closed by tcp_twclose().
+ *
+ * It is because only inpcbs in FIN_WAIT2 or CLOSING states can
+ * transition in TIME_WAIT state. Then a pcbcb cannot be in
+ * TIME_WAIT list and transitioning to TIME_WAIT state at same
+ * time.
+ */
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ }
+#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
for (;;) {
+ TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
+ TW_RUNLOCK(V_tw_lock);
break;
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, reuse);
- if (reuse)
- return (tw);
+ }
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
+ TW_RUNLOCK(V_tw_lock);
+
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ tcp_twclose(tw, reuse);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
+ } else {
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
+ break;
+ }
}
- return (NULL);
+
+ return NULL;
}